test/CodeGen/AMDGPU/merge-stores.ll

   1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
   2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
   3
   4 ; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
   5 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
   6
   7 ; Run with devices with different unaligned load restrictions.
   8
   9 ; TODO: Vector element tests
  10 ; TODO: Non-zero base offset for load and store combinations
  11 ; TODO: Same base addrspacecasted
  12
  13
  14 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
  15 ; GCN: buffer_store_byte
  16 ; GCN: buffer_store_byte
  17 ; GCN: s_endpgm
  18 define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { ; two adjacent constant i8 stores; the base store is marked align 2
  19   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 ; byte 1 of %out
  20
  21   store i8 123, i8 addrspace(1)* %out.gep.1 ; the higher address is written first
  22   store i8 200, i8 addrspace(1)* %out, align 2 ; was 456, which is not representable in i8; 200 is its low 8 bits
  23   ret void
  24 }
  25
  26 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
  27 ; GCN: buffer_store_byte
  28 ; GCN: buffer_store_byte
  29 ; GCN: s_endpgm
  30 define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { ; same pair of i8 stores, but with no explicit alignment on either
  31   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 ; byte 1 of %out
  32
  33   store i8 123, i8 addrspace(1)* %out.gep.1 ; the higher address is written first
  34   store i8 200, i8 addrspace(1)* %out ; was 456, which is not representable in i8; 200 is its low 8 bits
  35   ret void
  36 }
  37
  38 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
  39 ; GCN: buffer_store_dword v
  40 define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { ; two adjacent constant i16 stores; the checks above expect a single dword store
  41   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 ; element 1 of %out
  42
  43   store i16 123, i16 addrspace(1)* %out.gep.1 ; the higher address is written first
  44   store i16 456, i16 addrspace(1)* %out, align 4 ; the base store carries an explicit 4-byte alignment
  45   ret void
  46 }
  47
  48 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
  49 ; GCN: buffer_store_dword v
  50 define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { ; as the i16 test above, but both stored constants are zero
  51   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 ; element 1 of %out
  52
  53   store i16 0, i16 addrspace(1)* %out.gep.1 ; the higher address is written first
  54   store i16 0, i16 addrspace(1)* %out, align 4 ; the base store carries an explicit 4-byte alignment
  55   ret void
  56 }
  57
  58 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
  59 ; GCN: buffer_store_short
  60 ; GCN: buffer_store_short
  61 ; GCN: s_endpgm
  62 define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { ; two adjacent i16 stores with no explicit alignment; the checks above expect two short stores
  63   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 ; element 1 of %out
  64
  65   store i16 123, i16 addrspace(1)* %out.gep.1 ; the higher address is written first
  66   store i16 456, i16 addrspace(1)* %out ; no align operand, unlike the i16 test that expects a dword store
  67   ret void
  68 }
  69
  70 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
  71 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
  72 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
  73 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
  74 define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { ; two adjacent constant i32 stores; the checks above expect one dwordx2 store
  75   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 ; element 1 of %out
  76
  77   store i32 123, i32 addrspace(1)* %out.gep.1 ; 123 = 0x7b, the high register in the checks
  78   store i32 456, i32 addrspace(1)* %out ; 456 = 0x1c8, the low register in the checks
  79   ret void
  80 }
  81
  82 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
  83 ; GCN: buffer_store_dwordx2
  84 define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { ; adjacent float and i32 constant stores through an i32 base pointer
  85   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 ; element 1 of %out
  86   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* ; same address viewed as float
  87   store float 1.0, float addrspace(1)* %out.gep.1.bc ; float goes to the higher address, written first
  88   store i32 456, i32 addrspace(1)* %out ; integer goes to the base address
  89   ret void
  90 }
  91
  92 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
  93 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
  94 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
  95 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
  96 define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { ; adjacent i32 and float constant stores through a float base pointer
  97   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 ; element 1 of %out
  98   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* ; same address viewed as i32
  99   store i32 123, i32 addrspace(1)* %out.gep.1.bc ; 123 = 0x7b, the high register in the checks
 100   store float 4.0, float addrspace(1)* %out ; 4.0 is the low register in the checks
 101   ret void
 102 }
 103
 104 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
 105 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
 106 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
 107 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
 108 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
 109 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
 110 define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { ; four adjacent constant i32 stores; the checks above expect one dwordx4 store
 111   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 112   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 113   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 114
 115   store i32 123, i32 addrspace(1)* %out.gep.1 ; 0x7b
 116   store i32 456, i32 addrspace(1)* %out.gep.2 ; 0x1c8
 117   store i32 333, i32 addrspace(1)* %out.gep.3 ; 0x14d, the last register of the checked range
 118   store i32 1234, i32 addrspace(1)* %out ; 0x4d2, the first register of the checked range; element 0 is written last
 119   ret void
 120 }
 121
 122 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
 123 ; GCN: buffer_store_dwordx4
 124 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { ; four adjacent constant float stores issued in ascending address order
 125   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 126   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 127   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 128
 129   store float 8.0, float addrspace(1)* %out ; element 0 is written first here
 130   store float 1.0, float addrspace(1)* %out.gep.1
 131   store float 2.0, float addrspace(1)* %out.gep.2
 132   store float 4.0, float addrspace(1)* %out.gep.3
 133   ret void
 134 }
 135
 136 ; First store is out of order.
 137 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
 138 ; GCN: buffer_store_dwordx4
 139 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { ; same four float stores as the _order variant, but element 0 is written last
 140   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 141   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 142   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 143
 144   store float 1.0, float addrspace(1)* %out.gep.1
 145   store float 2.0, float addrspace(1)* %out.gep.2
 146   store float 4.0, float addrspace(1)* %out.gep.3
 147   store float 8.0, float addrspace(1)* %out ; base element, written after the three higher ones
 148   ret void
 149 }
 150
 151 ; FIXME: Should be able to merge this
 152 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
 153 ; GCN-NOAA: buffer_store_dword v
 154 ; GCN-NOAA: buffer_store_dword v
 155 ; GCN-NOAA: buffer_store_dword v
 156 ; GCN-NOAA: buffer_store_dword v
 157
 158 ; GCN-AA: buffer_store_dwordx2
 159 ; GCN-AA: buffer_store_dword v
 160 ; GCN-AA: buffer_store_dword v
 161
 162 ; GCN: s_endpgm
 163 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { ; four adjacent stores alternating float and i32 element types
 164   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 165   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 166   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 167
 168   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* ; element 1 viewed as i32
 169   %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)* ; element 3 viewed as i32
 170
 171   store i32 11, i32 addrspace(1)* %out.gep.1.bc ; elements 1 and 3 take integers
 172   store float 2.0, float addrspace(1)* %out.gep.2 ; elements 0 and 2 take floats
 173   store i32 17, i32 addrspace(1)* %out.gep.3.bc
 174   store float 8.0, float addrspace(1)* %out ; base element is written last
 175   ret void
 176 }
 177
 178 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
 179 ; SI-DAG: buffer_store_dwordx2
 180 ; SI-DAG: buffer_store_dword v
 181 ; SI-NOT: buffer_store_dword
 182 ; GCN: s_endpgm
 183 define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { ; three adjacent constant i32 stores; expected as one dwordx2 store plus one dword store
 184   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 ; the single-dword pattern above ends in " v" so the dwordx2 line cannot satisfy it
 185   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 186
 187   store i32 123, i32 addrspace(1)* %out.gep.1
 188   store i32 456, i32 addrspace(1)* %out.gep.2
 189   store i32 1234, i32 addrspace(1)* %out ; base element is written last
 190   ret void
 191 }
 192
 193 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
 194 ; GCN: buffer_store_dwordx4
 195 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { ; two adjacent constant i64 stores; the checks above expect one dwordx4 store
 196   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 ; element 1 of %out
 197
 198   store i64 123, i64 addrspace(1)* %out.gep.1 ; the higher address is written first
 199   store i64 456, i64 addrspace(1)* %out
 200   ret void
 201 }
 202
 203 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
 204 ; GCN: buffer_store_dwordx4
 205 ; GCN: buffer_store_dwordx4
 206 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { ; four adjacent constant i64 stores; the checks above expect two dwordx4 stores
 207   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 208   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
 209   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
 210
 211   store i64 123, i64 addrspace(1)* %out.gep.1
 212   store i64 456, i64 addrspace(1)* %out.gep.2
 213   store i64 333, i64 addrspace(1)* %out.gep.3
 214   store i64 1234, i64 addrspace(1)* %out ; base element is written last
 215   ret void
 216 }
 217
 218 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
 219 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 220 ; GCN: buffer_store_dwordx2 [[LOAD]]
 221 define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; copies two adjacent i32 values from %in to %out; the checks above expect a dwordx2 load feeding a dwordx2 store
 222   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 223   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 224
 225   %lo = load i32, i32 addrspace(1)* %in ; element 0
 226   %hi = load i32, i32 addrspace(1)* %in.gep.1 ; element 1
 227
 228   store i32 %lo, i32 addrspace(1)* %out ; each value keeps its element position
 229   store i32 %hi, i32 addrspace(1)* %out.gep.1
 230   ret void
 231 }
 232
 233 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
 234 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 235 ; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 236 define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; two-element copy where both pairs start at element 2 rather than at the pointer argument
 237   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 ; element 2; the checks above expect offset:8 on the load
 238   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 239
 240   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2 ; element 2; the checks above expect offset:8 on the store
 241   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 242   %lo = load i32, i32 addrspace(1)* %in.gep.0
 243   %hi = load i32, i32 addrspace(1)* %in.gep.1
 244
 245   store i32 %lo, i32 addrspace(1)* %out.gep.0 ; each value keeps its relative position
 246   store i32 %hi, i32 addrspace(1)* %out.gep.1
 247   ret void
 248 }
 249
 250 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
 251 ; GCN: buffer_load_dword v
 252 ; GCN: buffer_load_dword v
 253 ; GCN: buffer_store_dword v
 254 ; GCN: buffer_store_dword v
 255 define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; two-element copy that swaps the elements; the checks above expect separate dword loads and stores
 256   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 257   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 258
 259   %lo = load i32, i32 addrspace(1)* %in ; element 0
 260   %hi = load i32, i32 addrspace(1)* %in.gep.1 ; element 1
 261
 262   store i32 %hi, i32 addrspace(1)* %out ; element 1 of %in goes to element 0 of %out
 263   store i32 %lo, i32 addrspace(1)* %out.gep.1 ; element 0 of %in goes to element 1 of %out
 264   ret void
 265 }
 266
 267 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
 268 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 269 ; GCN: buffer_store_dwordx4 [[LOAD]]
 270 define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; copies four adjacent i32 values; the checks above expect a dwordx4 load feeding a dwordx4 store
 271   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 272   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 273   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 274   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 275   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 276   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 277
 278   %x = load i32, i32 addrspace(1)* %in ; elements 0..3 are loaded in ascending order
 279   %y = load i32, i32 addrspace(1)* %in.gep.1
 280   %z = load i32, i32 addrspace(1)* %in.gep.2
 281   %w = load i32, i32 addrspace(1)* %in.gep.3
 282
 283   store i32 %x, i32 addrspace(1)* %out ; and stored in the same order to the same element positions
 284   store i32 %y, i32 addrspace(1)* %out.gep.1
 285   store i32 %z, i32 addrspace(1)* %out.gep.2
 286   store i32 %w, i32 addrspace(1)* %out.gep.3
 287   ret void
 288 }
 289
 290 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
 291 ; SI-DAG: buffer_load_dwordx2
 292 ; SI-DAG: buffer_load_dword v
 293 ; GCN: s_waitcnt
 294 ; SI-DAG: buffer_store_dword v
 295 ; SI-DAG: buffer_store_dwordx2 v
 296 ; GCN: s_endpgm
 297 define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; copies three adjacent i32 values; the checks above expect a dwordx2 plus a dword access on each side
 298   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 299   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 300   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 301   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 302
 303   %x = load i32, i32 addrspace(1)* %in ; elements 0..2 are loaded in ascending order
 304   %y = load i32, i32 addrspace(1)* %in.gep.1
 305   %z = load i32, i32 addrspace(1)* %in.gep.2
 306
 307   store i32 %x, i32 addrspace(1)* %out ; and stored in the same order to the same element positions
 308   store i32 %y, i32 addrspace(1)* %out.gep.1
 309   store i32 %z, i32 addrspace(1)* %out.gep.2
 310   ret void
 311 }
 312
 313 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
 314 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 315 ; GCN: buffer_store_dwordx4 [[LOAD]]
 316 define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { ; float variant of the four-element copy; the checks above expect a dwordx4 load feeding a dwordx4 store
 317   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 318   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 319   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 320   %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
 321   %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
 322   %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
 323
 324   %x = load float, float addrspace(1)* %in ; elements 0..3 are loaded in ascending order
 325   %y = load float, float addrspace(1)* %in.gep.1
 326   %z = load float, float addrspace(1)* %in.gep.2
 327   %w = load float, float addrspace(1)* %in.gep.3
 328
 329   store float %x, float addrspace(1)* %out ; and stored in the same order to the same element positions
 330   store float %y, float addrspace(1)* %out.gep.1
 331   store float %z, float addrspace(1)* %out.gep.2
 332   store float %w, float addrspace(1)* %out.gep.3
 333   ret void
 334 }
 335
 336 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
 337 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 338 ; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
 339 define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; four-element copy where source and destination start at different non-zero elements
 340   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 ; element 11; the checks above expect offset:44 on the load
 341   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
 342   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
 343   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
 344   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7 ; element 7; the checks above expect offset:28 on the store
 345   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
 346   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
 347   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
 348
 349   %x = load i32, i32 addrspace(1)* %in.gep.0 ; four consecutive source elements, ascending
 350   %y = load i32, i32 addrspace(1)* %in.gep.1
 351   %z = load i32, i32 addrspace(1)* %in.gep.2
 352   %w = load i32, i32 addrspace(1)* %in.gep.3
 353
 354   store i32 %x, i32 addrspace(1)* %out.gep.0 ; written to four consecutive destination elements in the same order
 355   store i32 %y, i32 addrspace(1)* %out.gep.1
 356   store i32 %z, i32 addrspace(1)* %out.gep.2
 357   store i32 %w, i32 addrspace(1)* %out.gep.3
 358   ret void
 359 }
 360
 361 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
 362 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 363 ; GCN: s_barrier
 364 ; GCN: buffer_store_dwordx4 [[LOAD]]
 365 define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; four-element copy with a barrier between loads and stores, and the stores issued highest address first
 366   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 367   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 368   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 369   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 370   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 371   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 372
 373   %x = load i32, i32 addrspace(1)* %in ; elements 0..3 are loaded in ascending order
 374   %y = load i32, i32 addrspace(1)* %in.gep.1
 375   %z = load i32, i32 addrspace(1)* %in.gep.2
 376   %w = load i32, i32 addrspace(1)* %in.gep.3
 377
 378   ; Make sure the barrier doesn't stop this
 379   tail call void @llvm.AMDGPU.barrier.local() #1 ; the checks above expect s_barrier between the merged load and the merged store
 380
 381   store i32 %w, i32 addrspace(1)* %out.gep.3 ; descending address order; each value still lands on its own element index
 382   store i32 %z, i32 addrspace(1)* %out.gep.2
 383   store i32 %y, i32 addrspace(1)* %out.gep.1
 384   store i32 %x, i32 addrspace(1)* %out
 385
 386   ret void
 387 }
 388
 389 ; TODO: Re-packing of loaded register required. Maybe an IR pass
 390 ; should catch this?
 391
 392 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
 393 ; GCN: buffer_load_dword v
 394 ; GCN: buffer_load_dword v
 395 ; GCN: buffer_load_dword v
 396 ; GCN: buffer_load_dword v
 397 ; GCN: s_barrier
 398 ; GCN: buffer_store_dword v
 399 ; GCN: buffer_store_dword v
 400 ; GCN: buffer_store_dword v
 401 ; GCN: buffer_store_dword v
 402 define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; four-element copy that reverses element order; the checks above expect four dword loads and four dword stores
 403   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 404   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 405   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 406   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 407   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 408   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 409
 410   %x = load i32, i32 addrspace(1)* %in ; elements 0..3 are loaded in ascending order
 411   %y = load i32, i32 addrspace(1)* %in.gep.1
 412   %z = load i32, i32 addrspace(1)* %in.gep.2
 413   %w = load i32, i32 addrspace(1)* %in.gep.3
 414
 415   ; Make sure the barrier doesn't stop this
 416   tail call void @llvm.AMDGPU.barrier.local() #1 ; NOTE(review): the comment above matches the one in the inverse test, yet no merge is expected here - confirm wording
 417
 418   store i32 %w, i32 addrspace(1)* %out ; element 3 of %in goes to element 0 of %out
 419   store i32 %z, i32 addrspace(1)* %out.gep.1 ; element 2 goes to element 1
 420   store i32 %y, i32 addrspace(1)* %out.gep.2 ; element 1 goes to element 2
 421   store i32 %x, i32 addrspace(1)* %out.gep.3 ; element 0 goes to element 3
 422
 423   ret void
 424 }
 425
 426 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
 427 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 428 ; GCN: buffer_store_dword [[LOAD]]
 429 ; GCN: s_endpgm
 430 define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { ; copies four adjacent bytes; the checks above expect one dword load feeding one dword store
 431   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 432   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 433   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 434   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 435   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 436   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 437
 438   %x = load i8, i8 addrspace(1)* %in, align 4 ; only the base load carries an explicit 4-byte alignment
 439   %y = load i8, i8 addrspace(1)* %in.gep.1
 440   %z = load i8, i8 addrspace(1)* %in.gep.2
 441   %w = load i8, i8 addrspace(1)* %in.gep.3
 442
 443   store i8 %x, i8 addrspace(1)* %out, align 4 ; only the base store carries an explicit 4-byte alignment
 444   store i8 %y, i8 addrspace(1)* %out.gep.1
 445   store i8 %z, i8 addrspace(1)* %out.gep.2
 446   store i8 %w, i8 addrspace(1)* %out.gep.3
 447   ret void
 448 }
 449
 450 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
 451 ; GCN: buffer_load_ubyte
 452 ; GCN: buffer_load_ubyte
 453 ; GCN: buffer_load_ubyte
 454 ; GCN: buffer_load_ubyte
 455 ; GCN: buffer_store_byte
 456 ; GCN: buffer_store_byte
 457 ; GCN: buffer_store_byte
 458 ; GCN: buffer_store_byte
 459 ; GCN: s_endpgm
 460 define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { ; same four-byte copy with no explicit alignment; the checks above expect four byte loads and four byte stores
 461   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 462   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 463   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 464   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 465   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 466   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 467
 468   %x = load i8, i8 addrspace(1)* %in ; no align operand, unlike the align-4 variant of this test
 469   %y = load i8, i8 addrspace(1)* %in.gep.1
 470   %z = load i8, i8 addrspace(1)* %in.gep.2
 471   %w = load i8, i8 addrspace(1)* %in.gep.3
 472
 473   store i8 %x, i8 addrspace(1)* %out ; no align operand, unlike the align-4 variant of this test
 474   store i8 %y, i8 addrspace(1)* %out.gep.1
 475   store i8 %z, i8 addrspace(1)* %out.gep.2
 476   store i8 %w, i8 addrspace(1)* %out.gep.3
 477   ret void
 478 }
 479
 480 ; This works once AA is enabled on the subtarget
 481 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
 482 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 483
 484 ; GCN-NOAA: buffer_store_dword v
 485 ; GCN-NOAA: buffer_store_dword v
 486 ; GCN-NOAA: buffer_store_dword v
 487 ; GCN-NOAA: buffer_store_dword v
 488
 489 ; GCN-AA: buffer_store_dwordx4 [[LOAD]]
 490
 491 ; GCN: s_endpgm
 492 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { ; stores the four lanes of one loaded <4 x i32> to adjacent i32 slots
 493   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 494   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 495   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 496   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in ; a single vector load supplies all four values
 497
 498   %x = extractelement <4 x i32> %vec, i32 0
 499   %y = extractelement <4 x i32> %vec, i32 1
 500   %z = extractelement <4 x i32> %vec, i32 2
 501   %w = extractelement <4 x i32> %vec, i32 3
 502
 503   store i32 %x, i32 addrspace(1)* %out ; lane i is stored to element i; expected output differs between the two alias-analysis RUN configurations
 504   store i32 %y, i32 addrspace(1)* %out.gep.1
 505   store i32 %z, i32 addrspace(1)* %out.gep.2
 506   store i32 %w, i32 addrspace(1)* %out.gep.3
 507   ret void
 508 }
 509
 510 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
 511 ; GCN: ds_write_b8
 512 ; GCN: ds_write_b8
 513 ; GCN: s_endpgm
 514 define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { ; two adjacent constant i8 stores through an addrspace(3) pointer; the base store is marked align 2
 515   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 ; byte 1 of %out
 516
 517   store i8 123, i8 addrspace(3)* %out.gep.1 ; the higher address is written first
 518   store i8 200, i8 addrspace(3)* %out, align 2 ; was 456, which is not representable in i8; 200 is its low 8 bits
 519   ret void
 520 }
 521
 522 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
 523 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 524 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 525 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
 526 define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { ; two adjacent constant i32 stores through an addrspace(3) pointer; the checks above expect one ds_write2_b32
 527   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 ; element 1 of %out
 528
 529   store i32 123, i32 addrspace(3)* %out.gep.1 ; 123 = 0x7b, the second data operand in the checks
 530   store i32 456, i32 addrspace(3)* %out ; 456 = 0x1c8, the first data operand in the checks
 531   ret void
 532 }
 533
 534 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
 535 ; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
 536 ; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
 537 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3
 538
 539 ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
 540 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
 541 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
 542
 543 ; GCN: s_endpgm
 544 define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { ; four adjacent constant i32 stores through an addrspace(3) pointer; the checks above expect two ds_write2_b32
 545   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 546   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
 547   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
 548
 549   store i32 123, i32 addrspace(3)* %out.gep.1 ; 0x7b, paired with element 0 in the checks
 550   store i32 456, i32 addrspace(3)* %out.gep.2 ; 0x1c8, paired with element 3 in the checks
 551   store i32 333, i32 addrspace(3)* %out.gep.3 ; 0x14d
 552   store i32 1234, i32 addrspace(3)* %out ; 0x4d2; base element is written last
 553   ret void
 554 }
 555
 556 ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
 557 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
 558 ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
 559 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
 560 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
 561 ; GCN: buffer_store_dword v[[HI]]
 562 define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { ; five adjacent constant i32 stores; NOTE(review): no #0 attribute group here, unlike the tests above - confirm intentional
 563   store i32 9, i32 addrspace(1)* %out, align 4 ; element 0, the first register of the checked dwordx4 range
 564   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 565   store i32 12, i32 addrspace(1)* %idx1, align 4
 566   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 567   store i32 16, i32 addrspace(1)* %idx2, align 4
 568   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 569   store i32 -12, i32 addrspace(1)* %idx3, align 4 ; element 3, the last register of the checked dwordx4 range
 570   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 571   store i32 11, i32 addrspace(1)* %idx4, align 4 ; fifth element; the checks above expect it as a separate dword store
 572   ret void
 573 }
 574
 575 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
 576 ; GCN: buffer_store_dwordx4
 577 ; GCN: buffer_store_dwordx2
 578 define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { ; six adjacent constant i32 stores; the checks above expect a dwordx4 store followed by a dwordx2 store
 579   store i32 13, i32 addrspace(1)* %out, align 4 ; stores are issued in ascending address order, each GEP placed just before its store
 580   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 581   store i32 15, i32 addrspace(1)* %idx1, align 4
 582   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 583   store i32 62, i32 addrspace(1)* %idx2, align 4
 584   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 585   store i32 63, i32 addrspace(1)* %idx3, align 4
 586   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 587   store i32 11, i32 addrspace(1)* %idx4, align 4
 588   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 589   store i32 123, i32 addrspace(1)* %idx5, align 4
 590   ret void
 591 }
 592
 593 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
 594 ; GCN: buffer_store_dwordx4
 595 ; GCN: buffer_store_dwordx2
 596 ; GCN: buffer_store_dword v
 597 define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { ; seven adjacent constant i32 stores; the checks above expect dwordx4, dwordx2 and dword stores in that order
 598   store i32 34, i32 addrspace(1)* %out, align 4 ; stores are issued in ascending address order, each GEP placed just before its store
 599   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 600   store i32 999, i32 addrspace(1)* %idx1, align 4
 601   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 602   store i32 65, i32 addrspace(1)* %idx2, align 4
 603   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 604   store i32 33, i32 addrspace(1)* %idx3, align 4
 605   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 606   store i32 98, i32 addrspace(1)* %idx4, align 4
 607   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 608   store i32 91, i32 addrspace(1)* %idx5, align 4
 609   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 610   store i32 212, i32 addrspace(1)* %idx6, align 4
 611   ret void
 612 }
 613
 614 ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
 615 ; GCN: buffer_store_dwordx4
 616 ; GCN: buffer_store_dwordx4
 617 ; GCN: s_endpgm
 618 define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { ; eight adjacent constant i32 stores; the checks above expect two dwordx4 stores
 619   store i32 34, i32 addrspace(1)* %out, align 4 ; stores are issued in ascending address order, each GEP placed just before its store
 620   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 621   store i32 999, i32 addrspace(1)* %idx1, align 4
 622   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 623   store i32 65, i32 addrspace(1)* %idx2, align 4
 624   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 625   store i32 33, i32 addrspace(1)* %idx3, align 4
 626   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 627   store i32 98, i32 addrspace(1)* %idx4, align 4
 628   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 629   store i32 91, i32 addrspace(1)* %idx5, align 4
 630   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 631   store i32 212, i32 addrspace(1)* %idx6, align 4
 632   %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
 633   store i32 999, i32 addrspace(1)* %idx7, align 4 ; same constant as element 1
 634   ret void
 635 }
 636
 637 declare void @llvm.AMDGPU.barrier.local() #1 ; barrier called between loads and stores in the inverse and shuffle four-element tests
 638
 639 attributes #0 = { nounwind } ; attached to every test function except the 5/6/7/8-constant tests
 640 attributes #1 = { noduplicate nounwind } ; attached to the barrier declaration and its call sites