test/CodeGen/AMDGPU/merge-stores.ll

   1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
   2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
   3
   4 ; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
   5 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
   6
   7 ; Run with devices with different unaligned load restrictions.
   8
   9 ; TODO: Vector element tests
  10 ; TODO: Non-zero base offset for load and store combinations
  11 ; TODO: Same base addrspacecasted
  12
  13
  14 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
  15 ; GCN: buffer_store_byte
  16 ; GCN: buffer_store_byte
  17 ; GCN: s_endpgm
  18 define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
       ; Two constant i8 stores to adjacent global bytes: %out+1 is written
       ; first, then %out (the only store carrying an explicit alignment).
  19   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  20
  21   store i8 123, i8 addrspace(1)* %out.gep.1
       ; 456 does not fit in i8; -56 (0xc8) is its low 8 bits, written
       ; explicitly instead of relying on the literal being truncated.
  22   store i8 -56, i8 addrspace(1)* %out, align 2
  23   ret void
  24 }
  25
  26 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
  27 ; GCN: buffer_store_byte
  28 ; GCN: buffer_store_byte
  29 ; GCN: s_endpgm
  30 define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
       ; Two constant i8 stores to adjacent global bytes with no explicit
       ; alignment on either store; %out+1 is written before %out.
  31   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  32
  33   store i8 123, i8 addrspace(1)* %out.gep.1
       ; 456 does not fit in i8; -56 (0xc8) is its low 8 bits, written
       ; explicitly instead of relying on the literal being truncated.
  34   store i8 -56, i8 addrspace(1)* %out
  35   ret void
  36 }
  37
  38 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
  39 ; GCN: buffer_store_dword v
  40 define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
       ; Two constant i16 stores to adjacent global elements: element 1 is
       ; written first, then element 0 with an explicit align 4.
  41   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  42
  43   store i16 123, i16 addrspace(1)* %out.gep.1
  44   store i16 456, i16 addrspace(1)* %out, align 4
  45   ret void
  46 }
  47
  48 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
  49 ; GCN: buffer_store_dword v
  50 define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
       ; Two zero-valued i16 stores to adjacent global elements: element 1 is
       ; written first, then element 0 with an explicit align 4.
  51   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  52
  53   store i16 0, i16 addrspace(1)* %out.gep.1
  54   store i16 0, i16 addrspace(1)* %out, align 4
  55   ret void
  56 }
  57
  58 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
  59 ; GCN: buffer_store_short
  60 ; GCN: buffer_store_short
  61 ; GCN: s_endpgm
  62 define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
       ; Two constant i16 stores to adjacent global elements with no explicit
       ; alignment on either store; element 1 is written before element 0.
  63   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  64
  65   store i16 123, i16 addrspace(1)* %out.gep.1
  66   store i16 456, i16 addrspace(1)* %out
  67   ret void
  68 }
  69
  70 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
  71 ; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
  72 ; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
  73 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
  74 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
  75 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
  76 define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
       ; Two constant i32 stores to adjacent global elements: 123 (0x7b) goes
       ; to element 1 first, then 456 (0x1c8) to element 0.
  77   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  78
  79   store i32 123, i32 addrspace(1)* %out.gep.1
  80   store i32 456, i32 addrspace(1)* %out
  81   ret void
  82 }
  83
  84 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
  85 ; GCN: buffer_store_dwordx2
  86 define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
       ; Mixed-type pair: float 1.0 is stored to element 1 through a bitcast
       ; of the i32 pointer, then i32 456 is stored to element 0.
  87   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  88   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  89   store float 1.0, float addrspace(1)* %out.gep.1.bc
  90   store i32 456, i32 addrspace(1)* %out
  91   ret void
  92 }
  93
  94 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
  95 ; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0
  96 ; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}}
  97 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]]
  98 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]]
  99 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 100 define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
       ; Mixed-type pair: i32 123 (0x7b) is stored to element 1 through a
       ; bitcast of the float pointer, then float 4.0 is stored to element 0.
 101   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 102   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
 103   store i32 123, i32 addrspace(1)* %out.gep.1.bc
 104   store float 4.0, float addrspace(1)* %out
 105   ret void
 106 }
 107
 108 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
 109 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
 110 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
 111 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
 112 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
 113 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
 114 define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
       ; Four constant i32 stores covering elements 0..3. Elements 1, 2 and 3
       ; are written first; element 0 (1234 = 0x4d2) is written last.
 115   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 116   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 117   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 118
 119   store i32 123, i32 addrspace(1)* %out.gep.1
 120   store i32 456, i32 addrspace(1)* %out.gep.2
 121   store i32 333, i32 addrspace(1)* %out.gep.3
 122   store i32 1234, i32 addrspace(1)* %out
 123   ret void
 124 }
 125
 126 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
 127 ; GCN: buffer_store_dwordx4
 128 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
       ; Four constant float stores covering elements 0..3, issued in
       ; ascending address order (element 0 first, element 3 last).
 129   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 130   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 131   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 132
 133   store float 8.0, float addrspace(1)* %out
 134   store float 1.0, float addrspace(1)* %out.gep.1
 135   store float 2.0, float addrspace(1)* %out.gep.2
 136   store float 4.0, float addrspace(1)* %out.gep.3
 137   ret void
 138 }
 139
 140 ; First store is out of order.
 141 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
 142 ; GCN: buffer_store_dwordx4
 143 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
       ; Four constant float stores covering elements 0..3. Elements 1, 2 and
       ; 3 are written first; the lowest-address store (element 0) comes last.
 144   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 145   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 146   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 147
 148   store float 1.0, float addrspace(1)* %out.gep.1
 149   store float 2.0, float addrspace(1)* %out.gep.2
 150   store float 4.0, float addrspace(1)* %out.gep.3
 151   store float 8.0, float addrspace(1)* %out
 152   ret void
 153 }
 154
 155 ; FIXME: Should be able to merge this
 156 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
 157 ; GCN-NOAA: buffer_store_dword v
 158 ; GCN-NOAA: buffer_store_dword v
 159 ; GCN-NOAA: buffer_store_dword v
 160 ; GCN-NOAA: buffer_store_dword v
 161
 162 ; GCN-AA: buffer_store_dwordx2
 163 ; GCN-AA: buffer_store_dword v
 164 ; GCN-AA: buffer_store_dword v
 165
 166 ; GCN: s_endpgm
 167 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
       ; Four constant stores covering elements 0..3 with alternating types:
       ; elements 1 and 3 are written as i32 through bitcast pointers,
       ; elements 2 and 0 as float. Element 0 is written last.
 168   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 169   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 170   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 171
 172   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
 173   %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
 174
 175   store i32 11, i32 addrspace(1)* %out.gep.1.bc
 176   store float 2.0, float addrspace(1)* %out.gep.2
 177   store i32 17, i32 addrspace(1)* %out.gep.3.bc
 178   store float 8.0, float addrspace(1)* %out
 179   ret void
 180 }
 181
 182 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
 183 ; SI-DAG: buffer_store_dwordx2
 184 ; SI-DAG: buffer_store_dword
 185 ; SI-NOT: buffer_store_dword
 186 ; GCN: s_endpgm
 187 define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
       ; Three constant i32 stores covering elements 0..2 (a count that is not
       ; a power of two). Elements 1 and 2 are written first, element 0 last.
 188   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 189   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 190
 191   store i32 123, i32 addrspace(1)* %out.gep.1
 192   store i32 456, i32 addrspace(1)* %out.gep.2
 193   store i32 1234, i32 addrspace(1)* %out
 194   ret void
 195 }
 196
 197 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
 198 ; XGCN: buffer_store_dwordx4
 199 ; GCN: buffer_store_dwordx2
 200 ; GCN: buffer_store_dwordx2
 201 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
       ; Two constant i64 stores to adjacent global elements; element 1 is
       ; written before element 0. The GEP index here is i64, not i32.
 202   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 203
 204   store i64 123, i64 addrspace(1)* %out.gep.1
 205   store i64 456, i64 addrspace(1)* %out
 206   ret void
 207 }
 208
 209 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
 210 ; XGCN: buffer_store_dwordx4
 211 ; XGCN: buffer_store_dwordx4
 212
 213 ; GCN: buffer_store_dwordx2
 214 ; GCN: buffer_store_dwordx2
 215 ; GCN: buffer_store_dwordx2
 216 ; GCN: buffer_store_dwordx2
 217 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
       ; Four constant i64 stores covering elements 0..3. Elements 1, 2 and 3
       ; are written first; element 0 is written last.
 218   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 219   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
 220   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
 221
 222   store i64 123, i64 addrspace(1)* %out.gep.1
 223   store i64 456, i64 addrspace(1)* %out.gep.2
 224   store i64 333, i64 addrspace(1)* %out.gep.3
 225   store i64 1234, i64 addrspace(1)* %out
 226   ret void
 227 }
 228
 229 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
 230 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 231 ; GCN: buffer_store_dwordx2 [[LOAD]]
 232 define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Copies two adjacent i32 values from %in to %out. Both loads are
       ; issued before either store, and element order is preserved.
 233   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 234   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 235
 236   %lo = load i32, i32 addrspace(1)* %in
 237   %hi = load i32, i32 addrspace(1)* %in.gep.1
 238
 239   store i32 %lo, i32 addrspace(1)* %out
 240   store i32 %hi, i32 addrspace(1)* %out.gep.1
 241   ret void
 242 }
 243
 244 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
 245 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 246 ; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 247 define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Copies i32 elements 2 and 3 of %in to elements 2 and 3 of %out, so
       ; both the load pair and the store pair start 8 bytes past their base.
 248   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 249   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 250
 251   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 252   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 253   %lo = load i32, i32 addrspace(1)* %in.gep.0
 254   %hi = load i32, i32 addrspace(1)* %in.gep.1
 255
 256   store i32 %lo, i32 addrspace(1)* %out.gep.0
 257   store i32 %hi, i32 addrspace(1)* %out.gep.1
 258   ret void
 259 }
 260
 261 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
 262 ; GCN: buffer_load_dword v
 263 ; GCN: buffer_load_dword v
 264 ; GCN: buffer_store_dword v
 265 ; GCN: buffer_store_dword v
 266 define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Loads two adjacent i32 values and stores them swapped: the value from
       ; %in element 1 goes to %out element 0 and vice versa.
 267   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 268   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 269
 270   %lo = load i32, i32 addrspace(1)* %in
 271   %hi = load i32, i32 addrspace(1)* %in.gep.1
 272
 273   store i32 %hi, i32 addrspace(1)* %out
 274   store i32 %lo, i32 addrspace(1)* %out.gep.1
 275   ret void
 276 }
 277
 278 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
 279 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 280 ; GCN: buffer_store_dwordx4 [[LOAD]]
 281 define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Copies four adjacent i32 values from %in to %out. All four loads
       ; precede the stores, and element order is preserved.
 282   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 283   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 284   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 285   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 286   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 287   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 288
 289   %x = load i32, i32 addrspace(1)* %in
 290   %y = load i32, i32 addrspace(1)* %in.gep.1
 291   %z = load i32, i32 addrspace(1)* %in.gep.2
 292   %w = load i32, i32 addrspace(1)* %in.gep.3
 293
 294   store i32 %x, i32 addrspace(1)* %out
 295   store i32 %y, i32 addrspace(1)* %out.gep.1
 296   store i32 %z, i32 addrspace(1)* %out.gep.2
 297   store i32 %w, i32 addrspace(1)* %out.gep.3
 298   ret void
 299 }
 300
 301 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
 302 ; SI-DAG: buffer_load_dwordx2
 303 ; SI-DAG: buffer_load_dword v
 304 ; GCN: s_waitcnt
 305 ; SI-DAG: buffer_store_dword v
 306 ; SI-DAG: buffer_store_dwordx2 v
 307 ; GCN: s_endpgm
 308 define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Copies three adjacent i32 values from %in to %out (a count that is
       ; not a power of two). All loads precede the stores; order is preserved.
 309   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 310   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 311   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 312   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 313
 314   %x = load i32, i32 addrspace(1)* %in
 315   %y = load i32, i32 addrspace(1)* %in.gep.1
 316   %z = load i32, i32 addrspace(1)* %in.gep.2
 317
 318   store i32 %x, i32 addrspace(1)* %out
 319   store i32 %y, i32 addrspace(1)* %out.gep.1
 320   store i32 %z, i32 addrspace(1)* %out.gep.2
 321   ret void
 322 }
 323
 324 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
 325 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 326 ; GCN: buffer_store_dwordx4 [[LOAD]]
 327 define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
       ; Copies four adjacent float values from %in to %out. All four loads
       ; precede the stores, and element order is preserved.
 328   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 329   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 330   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 331   %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
 332   %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
 333   %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
 334
 335   %x = load float, float addrspace(1)* %in
 336   %y = load float, float addrspace(1)* %in.gep.1
 337   %z = load float, float addrspace(1)* %in.gep.2
 338   %w = load float, float addrspace(1)* %in.gep.3
 339
 340   store float %x, float addrspace(1)* %out
 341   store float %y, float addrspace(1)* %out.gep.1
 342   store float %z, float addrspace(1)* %out.gep.2
 343   store float %w, float addrspace(1)* %out.gep.3
 344   ret void
 345 }
 346
 347 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
 348 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 349 ; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
 350 define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Copies i32 elements 11..14 of %in to elements 7..10 of %out, so the
       ; load group starts 44 bytes past %in and the store group 28 bytes
       ; past %out.
 351   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
 352   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
 353   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
 354   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
 355   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
 356   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
 357   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
 358   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
 359
 360   %x = load i32, i32 addrspace(1)* %in.gep.0
 361   %y = load i32, i32 addrspace(1)* %in.gep.1
 362   %z = load i32, i32 addrspace(1)* %in.gep.2
 363   %w = load i32, i32 addrspace(1)* %in.gep.3
 364
 365   store i32 %x, i32 addrspace(1)* %out.gep.0
 366   store i32 %y, i32 addrspace(1)* %out.gep.1
 367   store i32 %z, i32 addrspace(1)* %out.gep.2
 368   store i32 %w, i32 addrspace(1)* %out.gep.3
 369   ret void
 370 }
 371
 372 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
 373 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 374 ; GCN: s_barrier
 375 ; GCN: buffer_store_dwordx4 [[LOAD]]
 376 define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Copies four adjacent i32 values from %in to %out with a barrier call
       ; between the loads and the stores. Each value keeps its element index,
       ; but the stores are issued in descending address order (3, 2, 1, 0).
 377   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 378   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 379   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 380   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 381   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 382   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 383
 384   %x = load i32, i32 addrspace(1)* %in
 385   %y = load i32, i32 addrspace(1)* %in.gep.1
 386   %z = load i32, i32 addrspace(1)* %in.gep.2
 387   %w = load i32, i32 addrspace(1)* %in.gep.3
 388
 389   ; Make sure the barrier doesn't stop this
 390   tail call void @llvm.AMDGPU.barrier.local() #1
 391
 392   store i32 %w, i32 addrspace(1)* %out.gep.3
 393   store i32 %z, i32 addrspace(1)* %out.gep.2
 394   store i32 %y, i32 addrspace(1)* %out.gep.1
 395   store i32 %x, i32 addrspace(1)* %out
 396
 397   ret void
 398 }
 399
 400 ; TODO: Re-packing of loaded register required. Maybe an IR pass
 401 ; should catch this?
 402
 403 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
 404 ; GCN: buffer_load_dword v
 405 ; GCN: buffer_load_dword v
 406 ; GCN: buffer_load_dword v
 407 ; GCN: buffer_load_dword v
 408 ; GCN: s_barrier
 409 ; GCN: buffer_store_dword v
 410 ; GCN: buffer_store_dword v
 411 ; GCN: buffer_store_dword v
 412 ; GCN: buffer_store_dword v
 413 define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Loads four adjacent i32 values, then after a barrier call stores them
       ; with the element order reversed: %in[3] -> %out[0], %in[2] -> %out[1],
       ; %in[1] -> %out[2], %in[0] -> %out[3].
 414   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 415   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 416   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 417   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 418   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 419   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 420
 421   %x = load i32, i32 addrspace(1)* %in
 422   %y = load i32, i32 addrspace(1)* %in.gep.1
 423   %z = load i32, i32 addrspace(1)* %in.gep.2
 424   %w = load i32, i32 addrspace(1)* %in.gep.3
 425
 426   ; Make sure the barrier doesn't stop this
 427   tail call void @llvm.AMDGPU.barrier.local() #1
 428
 429   store i32 %w, i32 addrspace(1)* %out
 430   store i32 %z, i32 addrspace(1)* %out.gep.1
 431   store i32 %y, i32 addrspace(1)* %out.gep.2
 432   store i32 %x, i32 addrspace(1)* %out.gep.3
 433
 434   ret void
 435 }
 436
 437 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
 438 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 439 ; GCN: buffer_store_dword [[LOAD]]
 440 ; GCN: s_endpgm
 441 define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
       ; Copies four adjacent bytes from %in to %out. Only the first load and
       ; the first store carry align 4; the GEP indices are i8 rather than i32.
 442   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 443   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 444   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 445   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 446   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 447   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 448
 449   %x = load i8, i8 addrspace(1)* %in, align 4
 450   %y = load i8, i8 addrspace(1)* %in.gep.1
 451   %z = load i8, i8 addrspace(1)* %in.gep.2
 452   %w = load i8, i8 addrspace(1)* %in.gep.3
 453
 454   store i8 %x, i8 addrspace(1)* %out, align 4
 455   store i8 %y, i8 addrspace(1)* %out.gep.1
 456   store i8 %z, i8 addrspace(1)* %out.gep.2
 457   store i8 %w, i8 addrspace(1)* %out.gep.3
 458   ret void
 459 }
 460
 461 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
 462 ; GCN: buffer_load_ubyte
 463 ; GCN: buffer_load_ubyte
 464 ; GCN: buffer_load_ubyte
 465 ; GCN: buffer_load_ubyte
 466 ; GCN: buffer_store_byte
 467 ; GCN: buffer_store_byte
 468 ; GCN: buffer_store_byte
 469 ; GCN: buffer_store_byte
 470 ; GCN: s_endpgm
 471 define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
       ; Copies four adjacent bytes from %in to %out with no explicit
       ; alignment on any load or store.
 472   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 473   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 474   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 475   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 476   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 477   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 478
 479   %x = load i8, i8 addrspace(1)* %in
 480   %y = load i8, i8 addrspace(1)* %in.gep.1
 481   %z = load i8, i8 addrspace(1)* %in.gep.2
 482   %w = load i8, i8 addrspace(1)* %in.gep.3
 483
 484   store i8 %x, i8 addrspace(1)* %out
 485   store i8 %y, i8 addrspace(1)* %out.gep.1
 486   store i8 %z, i8 addrspace(1)* %out.gep.2
 487   store i8 %w, i8 addrspace(1)* %out.gep.3
 488   ret void
 489 }
 490
 491 ; This works once AA is enabled on the subtarget
 492 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
 493 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 494
 495 ; GCN-NOAA: buffer_store_dword v
 496 ; GCN-NOAA: buffer_store_dword v
 497 ; GCN-NOAA: buffer_store_dword v
 498 ; GCN-NOAA: buffer_store_dword v
 499
 500 ; GCN-AA: buffer_store_dwordx4 [[LOAD]]
 501
 502 ; GCN: s_endpgm
 503 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
       ; Loads a single <4 x i32> from %in, extracts lanes 0..3, and stores
       ; each lane as a scalar i32 to the matching element of %out.
 504   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 505   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 506   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 507   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
 508
 509   %x = extractelement <4 x i32> %vec, i32 0
 510   %y = extractelement <4 x i32> %vec, i32 1
 511   %z = extractelement <4 x i32> %vec, i32 2
 512   %w = extractelement <4 x i32> %vec, i32 3
 513
 514   store i32 %x, i32 addrspace(1)* %out
 515   store i32 %y, i32 addrspace(1)* %out.gep.1
 516   store i32 %z, i32 addrspace(1)* %out.gep.2
 517   store i32 %w, i32 addrspace(1)* %out.gep.3
 518   ret void
 519 }
 520
 521 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
 522 ; GCN: ds_write_b8
 523 ; GCN: ds_write_b8
 524 ; GCN: s_endpgm
 525 define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
       ; Two constant i8 stores to adjacent bytes in addrspace(3): %out+1 is
       ; written first, then %out (the only store with an explicit alignment).
 526   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 527
 528   store i8 123, i8 addrspace(3)* %out.gep.1
       ; 456 does not fit in i8; -56 (0xc8) is its low 8 bits, written
       ; explicitly instead of relying on the literal being truncated.
 529   store i8 -56, i8 addrspace(3)* %out, align 2
 530   ret void
 531 }
 532
 533 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
 534 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 535 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 536 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
 537 define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
       ; Two constant i32 stores to adjacent addrspace(3) elements: 123 (0x7b)
       ; goes to element 1 first, then 456 (0x1c8) to element 0.
 538   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 539
 540   store i32 123, i32 addrspace(3)* %out.gep.1
 541   store i32 456, i32 addrspace(3)* %out
 542   ret void
 543 }
 544
 545 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
 546 ; GCN: ds_write_b32
 547 ; GCN: ds_write_b32
 548 ; GCN: ds_write_b32
 549 ; GCN: ds_write_b32
 550 define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
       ; Four constant i32 stores covering addrspace(3) elements 0..3.
       ; Elements 1, 2 and 3 are written first; element 0 is written last.
 551   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 552   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
 553   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
 554
 555   store i32 123, i32 addrspace(3)* %out.gep.1
 556   store i32 456, i32 addrspace(3)* %out.gep.2
 557   store i32 333, i32 addrspace(3)* %out.gep.3
 558   store i32 1234, i32 addrspace(3)* %out
 559   ret void
 560 }
 561
 562 ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
 563 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
 564 ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
 565 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
 566 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
 567 ; GCN: buffer_store_dword v[[HI]]
 568 define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
       ; Five constant i32 stores to consecutive elements 0..4 in ascending
       ; address order, each with align 4 and an inbounds GEP computed just
       ; before its store. This function has no attribute group attached.
 569   store i32 9, i32 addrspace(1)* %out, align 4
 570   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 571   store i32 12, i32 addrspace(1)* %idx1, align 4
 572   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 573   store i32 16, i32 addrspace(1)* %idx2, align 4
 574   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 575   store i32 -12, i32 addrspace(1)* %idx3, align 4
 576   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 577   store i32 11, i32 addrspace(1)* %idx4, align 4
 578   ret void
 579 }
 580
 581 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
 582 ; GCN: buffer_store_dwordx4
 583 ; GCN: buffer_store_dwordx2
 584 define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
       ; Six constant i32 stores to consecutive elements 0..5 in ascending
       ; address order, each with align 4 and an inbounds GEP computed just
       ; before its store. This function has no attribute group attached.
 585   store i32 13, i32 addrspace(1)* %out, align 4
 586   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 587   store i32 15, i32 addrspace(1)* %idx1, align 4
 588   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 589   store i32 62, i32 addrspace(1)* %idx2, align 4
 590   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 591   store i32 63, i32 addrspace(1)* %idx3, align 4
 592   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 593   store i32 11, i32 addrspace(1)* %idx4, align 4
 594   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 595   store i32 123, i32 addrspace(1)* %idx5, align 4
 596   ret void
 597 }
 598
 599 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
 600 ; GCN: buffer_store_dwordx4
 601 ; GCN: buffer_store_dwordx2
 602 ; GCN: buffer_store_dword v
 603 define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
       ; Seven constant i32 stores to consecutive elements 0..6 in ascending
       ; address order, each with align 4 and an inbounds GEP computed just
       ; before its store. This function has no attribute group attached.
 604   store i32 34, i32 addrspace(1)* %out, align 4
 605   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 606   store i32 999, i32 addrspace(1)* %idx1, align 4
 607   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 608   store i32 65, i32 addrspace(1)* %idx2, align 4
 609   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 610   store i32 33, i32 addrspace(1)* %idx3, align 4
 611   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 612   store i32 98, i32 addrspace(1)* %idx4, align 4
 613   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 614   store i32 91, i32 addrspace(1)* %idx5, align 4
 615   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 616   store i32 212, i32 addrspace(1)* %idx6, align 4
 617   ret void
 618 }
 619
 620 ; FIXME: This should do 2 dwordx4 stores
 621 ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
 622
 623 ; GCN-NOAA: buffer_store_dword v
 624 ; GCN-NOAA: buffer_store_dword v
 625 ; GCN-NOAA: buffer_store_dword v
 626 ; GCN-NOAA: buffer_store_dword v
 627 ; GCN-NOAA: buffer_store_dword v
 628 ; GCN-NOAA: buffer_store_dword v
 629 ; GCN-NOAA: buffer_store_dword v
 630 ; GCN-NOAA: buffer_store_dword v
 631
 632 ; GCN-AA: buffer_store_dwordx4
 633 ; GCN-AA: buffer_store_dwordx2
 634 ; GCN-AA: buffer_store_dwordx2
 635
 636 ; GCN: s_endpgm
 637 define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
       ; Eight constant i32 stores to consecutive elements 0..7 in ascending
       ; address order, each with align 4 and an inbounds GEP computed just
       ; before its store. This function has no attribute group attached.
 638   store i32 34, i32 addrspace(1)* %out, align 4
 639   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 640   store i32 999, i32 addrspace(1)* %idx1, align 4
 641   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 642   store i32 65, i32 addrspace(1)* %idx2, align 4
 643   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 644   store i32 33, i32 addrspace(1)* %idx3, align 4
 645   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 646   store i32 98, i32 addrspace(1)* %idx4, align 4
 647   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 648   store i32 91, i32 addrspace(1)* %idx5, align 4
 649   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 650   store i32 212, i32 addrspace(1)* %idx6, align 4
 651   %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
 652   store i32 999, i32 addrspace(1)* %idx7, align 4
 653   ret void
 654 }
 655
 656 declare void @llvm.AMDGPU.barrier.local() #1
 657
 658 attributes #0 = { nounwind }
 659 attributes #1 = { noduplicate nounwind }