test/CodeGen/AMDGPU/merge-stores.ll

   1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
   2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
   3
   4 ; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
   5 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
   6
   7 ; Run with devices with different unaligned load restrictions.
   8
   9 ; TODO: Vector element tests
  10 ; TODO: Non-zero base offset for load and store combinations
  11 ; TODO: Same base addrspacecasted
  12
  13
  14 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
  15 ; GCN: buffer_store_byte
  16 ; GCN: buffer_store_byte
  17 ; GCN: s_endpgm
  18 define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {  ; two constant i8 stores to adjacent bytes; checks above expect two buffer_store_byte
  19   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1  ; address of byte 1
  20
  21   store i8 123, i8 addrspace(1)* %out.gep.1  ; higher address is written first
  22   store i8 -56, i8 addrspace(1)* %out, align 2  ; was 456, which does not fit in i8; -56 is the same low 8 bits (0xc8)
  23   ret void
  24 }
  25
  26 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
  27 ; GCN: buffer_store_byte
  28 ; GCN: buffer_store_byte
  29 ; GCN: s_endpgm
  30 define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {  ; same byte pair with no explicit alignment; checks above expect two buffer_store_byte
  31   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1  ; address of byte 1
  32
  33   store i8 123, i8 addrspace(1)* %out.gep.1  ; higher address is written first
  34   store i8 -56, i8 addrspace(1)* %out  ; was 456, which does not fit in i8; -56 is the same low 8 bits (0xc8)
  35   ret void
  36 }
  37
  38 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
  39 ; GCN: buffer_store_dword v
  40 define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {  ; two adjacent i16 constant stores; check above expects one buffer_store_dword
  41   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1  ; address of element 1
  42
  43   store i16 123, i16 addrspace(1)* %out.gep.1  ; higher address is written first
  44   store i16 456, i16 addrspace(1)* %out, align 4  ; base store carries explicit 4-byte alignment
  45   ret void
  46 }
  47
  48 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
  49 ; GCN: buffer_store_dword v
  50 define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {  ; both i16 constants are zero; check above expects one buffer_store_dword
  51   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1  ; address of element 1
  52
  53   store i16 0, i16 addrspace(1)* %out.gep.1  ; higher address is written first
  54   store i16 0, i16 addrspace(1)* %out, align 4  ; base store carries explicit 4-byte alignment
  55   ret void
  56 }
  57
  58 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
  59 ; GCN: buffer_store_short
  60 ; GCN: buffer_store_short
  61 ; GCN: s_endpgm
  62 define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {  ; i16 pair with no explicit alignment; checks above expect two buffer_store_short
  63   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1  ; address of element 1
  64
  65   store i16 123, i16 addrspace(1)* %out.gep.1  ; higher address is written first
  66   store i16 456, i16 addrspace(1)* %out  ; no align operand on the base store
  67   ret void
  68 }
  69
  70 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
  71 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
  72 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
  73 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
  74 define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {  ; two adjacent i32 constant stores; check above expects one buffer_store_dwordx2
  75   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; address of element 1
  76
  77   store i32 123, i32 addrspace(1)* %out.gep.1  ; 123 = 0x7b, the HI register in the checks
  78   store i32 456, i32 addrspace(1)* %out  ; 456 = 0x1c8, the LO register in the checks
  79   ret void
  80 }
  81
  82 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
  83 ; GCN: buffer_store_dwordx2
  84 define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {  ; mixed pair: i32 at element 0, float at element 1; check above expects buffer_store_dwordx2
  85   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; address of element 1
  86   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*  ; view element 1 as float
  87   store float 1.0, float addrspace(1)* %out.gep.1.bc  ; higher address is written first
  88   store i32 456, i32 addrspace(1)* %out
  89   ret void
  90 }
  91
  92 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
  93 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
  94 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
  95 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
  96 define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {  ; mixed pair: float at element 0, i32 at element 1; check above expects buffer_store_dwordx2
  97   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1  ; address of element 1
  98   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*  ; view element 1 as i32
  99   store i32 123, i32 addrspace(1)* %out.gep.1.bc  ; 123 = 0x7b, the VHI register in the checks
 100   store float 4.0, float addrspace(1)* %out  ; 4.0 is the VLO register in the checks
 101   ret void
 102 }
 103
 104 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
 105 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
 106 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
 107 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
 108 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
 109 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
 110 define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {  ; four adjacent i32 constant stores; check above expects one buffer_store_dwordx4
 111   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; addresses of elements 1..3
 112   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 113   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 114
 115   store i32 123, i32 addrspace(1)* %out.gep.1  ; 0x7b
 116   store i32 456, i32 addrspace(1)* %out.gep.2  ; 0x1c8
 117   store i32 333, i32 addrspace(1)* %out.gep.3  ; 0x14d, the HI register in the checks
 118   store i32 1234, i32 addrspace(1)* %out  ; 0x4d2, the LO register in the checks; element 0 is written last
 119   ret void
 120 }
 121
 122 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
 123 ; GCN: buffer_store_dwordx4
 124 define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {  ; four float constant stores in ascending address order; check above expects buffer_store_dwordx4
 125   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1  ; addresses of elements 1..3
 126   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 127   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 128
 129   store float 8.0, float addrspace(1)* %out  ; element 0 is written first
 130   store float 1.0, float addrspace(1)* %out.gep.1
 131   store float 2.0, float addrspace(1)* %out.gep.2
 132   store float 4.0, float addrspace(1)* %out.gep.3
 133   ret void
 134 }
 135
 136 ; First store is out of order.
 137 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
 138 ; GCN: buffer_store_dwordx4
 139 define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {  ; same four float constants, element 0 written last; check above expects buffer_store_dwordx4
 140   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1  ; addresses of elements 1..3
 141   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 142   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 143
 144   store float 1.0, float addrspace(1)* %out.gep.1
 145   store float 2.0, float addrspace(1)* %out.gep.2
 146   store float 4.0, float addrspace(1)* %out.gep.3
 147   store float 8.0, float addrspace(1)* %out  ; lowest address comes last in program order
 148   ret void
 149 }
 150
 151 ; FIXME: Should be able to merge this
 152 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
 153 ; GCN-NOAA: buffer_store_dword v
 154 ; GCN-NOAA: buffer_store_dword v
 155 ; GCN-NOAA: buffer_store_dword v
 156 ; GCN-NOAA: buffer_store_dword v
 157
 158 ; GCN-AA: buffer_store_dwordx2
 159 ; GCN-AA: buffer_store_dword v
 160 ; GCN-AA: buffer_store_dword v
 161
 162 ; GCN: s_endpgm
 163 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {  ; alternating float/i32 constant stores over four adjacent elements; see GCN-NOAA / GCN-AA checks above
 164   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1  ; addresses of elements 1..3
 165   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 166   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 167
 168   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*  ; elements 1 and 3 are stored as i32
 169   %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
 170
 171   store i32 11, i32 addrspace(1)* %out.gep.1.bc
 172   store float 2.0, float addrspace(1)* %out.gep.2
 173   store i32 17, i32 addrspace(1)* %out.gep.3.bc
 174   store float 8.0, float addrspace(1)* %out  ; element 0 is written last
 175   ret void
 176 }
 177
 178 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
 179 ; SI-DAG: buffer_store_dwordx2
 180 ; SI-DAG: buffer_store_dword
 181 ; SI-NOT: buffer_store_dword
 182 ; GCN: s_endpgm
 183 define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {  ; three adjacent i32 constant stores; checks above expect one dwordx2 plus one dword store
 184   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; addresses of elements 1..2
 185   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 186
 187   store i32 123, i32 addrspace(1)* %out.gep.1
 188   store i32 456, i32 addrspace(1)* %out.gep.2
 189   store i32 1234, i32 addrspace(1)* %out  ; element 0 is written last
 190   ret void
 191 }
 192
 193 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
 194 ; XGCN: buffer_store_dwordx4
 195 ; GCN: buffer_store_dwordx2
 196 ; GCN: buffer_store_dwordx2
 197 define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {  ; two adjacent i64 constant stores; active checks expect two buffer_store_dwordx2 (the XGCN dwordx4 line is disabled)
 198   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1  ; address of element 1
 199
 200   store i64 123, i64 addrspace(1)* %out.gep.1  ; higher address is written first
 201   store i64 456, i64 addrspace(1)* %out
 202   ret void
 203 }
 204
 205 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
 206 ; XGCN: buffer_store_dwordx4
 207 ; XGCN: buffer_store_dwordx4
 208
 209 ; GCN: buffer_store_dwordx2
 210 ; GCN: buffer_store_dwordx2
 211 ; GCN: buffer_store_dwordx2
 212 ; GCN: buffer_store_dwordx2
 213 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {  ; four adjacent i64 constant stores; active checks expect four buffer_store_dwordx2 (the XGCN dwordx4 lines are disabled)
 214   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1  ; addresses of elements 1..3
 215   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
 216   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
 217
 218   store i64 123, i64 addrspace(1)* %out.gep.1
 219   store i64 456, i64 addrspace(1)* %out.gep.2
 220   store i64 333, i64 addrspace(1)* %out.gep.3
 221   store i64 1234, i64 addrspace(1)* %out  ; element 0 is written last
 222   ret void
 223 }
 224
 225 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
 226 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 227 ; GCN: buffer_store_dwordx2 [[LOAD]]
 228 define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {  ; copy two adjacent i32 from %in to %out; checks above expect one dwordx2 load feeding one dwordx2 store
 229   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; element 1 of the destination
 230   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1  ; element 1 of the source
 231
 232   %lo = load i32, i32 addrspace(1)* %in
 233   %hi = load i32, i32 addrspace(1)* %in.gep.1
 234
 235   store i32 %lo, i32 addrspace(1)* %out  ; values are stored in the same element order they were loaded
 236   store i32 %hi, i32 addrspace(1)* %out.gep.1
 237   ret void
 238 }
 239
 240 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
 241 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 242 ; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 243 define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {  ; two-element copy starting at element 2 of both pointers; checks above expect offset:8 on load and store
 244   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2  ; source elements 2 and 3 (element 2 = byte offset 8)
 245   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 246
 247   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2  ; destination elements 2 and 3
 248   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 249   %lo = load i32, i32 addrspace(1)* %in.gep.0
 250   %hi = load i32, i32 addrspace(1)* %in.gep.1
 251
 252   store i32 %lo, i32 addrspace(1)* %out.gep.0
 253   store i32 %hi, i32 addrspace(1)* %out.gep.1
 254   ret void
 255 }
 256
 257 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
 258 ; GCN: buffer_load_dword v
 259 ; GCN: buffer_load_dword v
 260 ; GCN: buffer_store_dword v
 261 ; GCN: buffer_store_dword v
 262 define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {  ; two-element copy with the elements swapped; checks above expect separate dword loads and stores
 263   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; element 1 of the destination
 264   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1  ; element 1 of the source
 265
 266   %lo = load i32, i32 addrspace(1)* %in
 267   %hi = load i32, i32 addrspace(1)* %in.gep.1
 268
 269   store i32 %hi, i32 addrspace(1)* %out  ; source element 1 goes to destination element 0
 270   store i32 %lo, i32 addrspace(1)* %out.gep.1  ; source element 0 goes to destination element 1
 271   ret void
 272 }
 273
 274 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
 275 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 276 ; GCN: buffer_store_dwordx4 [[LOAD]]
 277 define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {  ; copy four adjacent i32 from %in to %out; checks above expect one dwordx4 load feeding one dwordx4 store
 278   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; destination elements 1..3
 279   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 280   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 281   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1  ; source elements 1..3
 282   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 283   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 284
 285   %x = load i32, i32 addrspace(1)* %in
 286   %y = load i32, i32 addrspace(1)* %in.gep.1
 287   %z = load i32, i32 addrspace(1)* %in.gep.2
 288   %w = load i32, i32 addrspace(1)* %in.gep.3
 289
 290   store i32 %x, i32 addrspace(1)* %out  ; stores follow the load order, ascending addresses
 291   store i32 %y, i32 addrspace(1)* %out.gep.1
 292   store i32 %z, i32 addrspace(1)* %out.gep.2
 293   store i32 %w, i32 addrspace(1)* %out.gep.3
 294   ret void
 295 }
 296
 297 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
 298 ; SI-DAG: buffer_load_dwordx2
 299 ; SI-DAG: buffer_load_dword v
 300 ; GCN: s_waitcnt
 301 ; SI-DAG: buffer_store_dword v
 302 ; SI-DAG: buffer_store_dwordx2 v
 303 ; GCN: s_endpgm
 304 define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {  ; copy three adjacent i32; checks above expect a dwordx2 + dword pair for both the loads and the stores
 305   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; destination elements 1..2
 306   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 307   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1  ; source elements 1..2
 308   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 309
 310   %x = load i32, i32 addrspace(1)* %in
 311   %y = load i32, i32 addrspace(1)* %in.gep.1
 312   %z = load i32, i32 addrspace(1)* %in.gep.2
 313
 314   store i32 %x, i32 addrspace(1)* %out  ; stores follow the load order, ascending addresses
 315   store i32 %y, i32 addrspace(1)* %out.gep.1
 316   store i32 %z, i32 addrspace(1)* %out.gep.2
 317   ret void
 318 }
 319
 320 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
 321 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 322 ; GCN: buffer_store_dwordx4 [[LOAD]]
 323 define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {  ; float variant of the four-element copy; checks above expect one dwordx4 load feeding one dwordx4 store
 324   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1  ; destination elements 1..3
 325   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 326   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 327   %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1  ; source elements 1..3
 328   %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
 329   %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
 330
 331   %x = load float, float addrspace(1)* %in
 332   %y = load float, float addrspace(1)* %in.gep.1
 333   %z = load float, float addrspace(1)* %in.gep.2
 334   %w = load float, float addrspace(1)* %in.gep.3
 335
 336   store float %x, float addrspace(1)* %out  ; stores follow the load order, ascending addresses
 337   store float %y, float addrspace(1)* %out.gep.1
 338   store float %z, float addrspace(1)* %out.gep.2
 339   store float %w, float addrspace(1)* %out.gep.3
 340   ret void
 341 }
 342
 343 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
 344 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 345 ; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
 346 define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {  ; four-element copy with different non-zero bases; checks above expect offset:44 on the load and offset:28 on the store
 347   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11  ; source elements 11..14 (element 11 = byte offset 44)
 348   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
 349   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
 350   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
 351   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7  ; destination elements 7..10 (element 7 = byte offset 28)
 352   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
 353   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
 354   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
 355
 356   %x = load i32, i32 addrspace(1)* %in.gep.0
 357   %y = load i32, i32 addrspace(1)* %in.gep.1
 358   %z = load i32, i32 addrspace(1)* %in.gep.2
 359   %w = load i32, i32 addrspace(1)* %in.gep.3
 360
 361   store i32 %x, i32 addrspace(1)* %out.gep.0  ; stores follow the load order, ascending addresses
 362   store i32 %y, i32 addrspace(1)* %out.gep.1
 363   store i32 %z, i32 addrspace(1)* %out.gep.2
 364   store i32 %w, i32 addrspace(1)* %out.gep.3
 365   ret void
 366 }
 367
 368 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
 369 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 370 ; GCN: s_barrier
 371 ; GCN: buffer_store_dwordx4 [[LOAD]]
 372 define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {  ; four-element copy with a barrier between loads and stores, stores issued high-to-low; checks above expect dwordx4 load, s_barrier, dwordx4 store
 373   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; destination elements 1..3
 374   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 375   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 376   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1  ; source elements 1..3
 377   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 378   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 379
 380   %x = load i32, i32 addrspace(1)* %in
 381   %y = load i32, i32 addrspace(1)* %in.gep.1
 382   %z = load i32, i32 addrspace(1)* %in.gep.2
 383   %w = load i32, i32 addrspace(1)* %in.gep.3
 384
 385   ; Make sure the barrier doesn't stop this
 386   tail call void @llvm.AMDGPU.barrier.local() #1
 387
 388   store i32 %w, i32 addrspace(1)* %out.gep.3  ; each value goes back to its own element index; only the program order is reversed
 389   store i32 %z, i32 addrspace(1)* %out.gep.2
 390   store i32 %y, i32 addrspace(1)* %out.gep.1
 391   store i32 %x, i32 addrspace(1)* %out
 392
 393   ret void
 394 }
 395
 396 ; TODO: Re-packing of loaded register required. Maybe an IR pass
 397 ; should catch this?
 398
 399 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
 400 ; GCN: buffer_load_dword v
 401 ; GCN: buffer_load_dword v
 402 ; GCN: buffer_load_dword v
 403 ; GCN: buffer_load_dword v
 404 ; GCN: s_barrier
 405 ; GCN: buffer_store_dword v
 406 ; GCN: buffer_store_dword v
 407 ; GCN: buffer_store_dword v
 408 ; GCN: buffer_store_dword v
 409 define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {  ; four-element copy that reverses the element order; checks above expect four dword loads, s_barrier, four dword stores
 410   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; destination elements 1..3
 411   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 412   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 413   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1  ; source elements 1..3
 414   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 415   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 416
 417   %x = load i32, i32 addrspace(1)* %in
 418   %y = load i32, i32 addrspace(1)* %in.gep.1
 419   %z = load i32, i32 addrspace(1)* %in.gep.2
 420   %w = load i32, i32 addrspace(1)* %in.gep.3
 421
 422   ; Make sure the barrier doesn't stop this
 423   tail call void @llvm.AMDGPU.barrier.local() #1
 424
 425   store i32 %w, i32 addrspace(1)* %out  ; source element 3 -> destination element 0
 426   store i32 %z, i32 addrspace(1)* %out.gep.1  ; source element 2 -> destination element 1
 427   store i32 %y, i32 addrspace(1)* %out.gep.2  ; source element 1 -> destination element 2
 428   store i32 %x, i32 addrspace(1)* %out.gep.3  ; source element 0 -> destination element 3
 429
 430   ret void
 431 }
 432
 433 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
 434 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 435 ; GCN: buffer_store_dword [[LOAD]]
 436 ; GCN: s_endpgm
 437 define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {  ; copy four adjacent bytes with 4-byte-aligned bases; checks above expect one dword load feeding one dword store
 438   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1  ; destination bytes 1..3
 439   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 440   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 441   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1  ; source bytes 1..3
 442   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 443   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 444
 445   %x = load i8, i8 addrspace(1)* %in, align 4  ; only the base load carries explicit 4-byte alignment
 446   %y = load i8, i8 addrspace(1)* %in.gep.1
 447   %z = load i8, i8 addrspace(1)* %in.gep.2
 448   %w = load i8, i8 addrspace(1)* %in.gep.3
 449
 450   store i8 %x, i8 addrspace(1)* %out, align 4  ; only the base store carries explicit 4-byte alignment
 451   store i8 %y, i8 addrspace(1)* %out.gep.1
 452   store i8 %z, i8 addrspace(1)* %out.gep.2
 453   store i8 %w, i8 addrspace(1)* %out.gep.3
 454   ret void
 455 }
 456
 457 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
 458 ; GCN: buffer_load_ubyte
 459 ; GCN: buffer_load_ubyte
 460 ; GCN: buffer_load_ubyte
 461 ; GCN: buffer_load_ubyte
 462 ; GCN: buffer_store_byte
 463 ; GCN: buffer_store_byte
 464 ; GCN: buffer_store_byte
 465 ; GCN: buffer_store_byte
 466 ; GCN: s_endpgm
 467 define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {  ; same four-byte copy without explicit alignment; checks above expect four ubyte loads and four byte stores
 468   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1  ; destination bytes 1..3
 469   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 470   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 471   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1  ; source bytes 1..3
 472   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 473   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 474
 475   %x = load i8, i8 addrspace(1)* %in  ; no align operand on any load
 476   %y = load i8, i8 addrspace(1)* %in.gep.1
 477   %z = load i8, i8 addrspace(1)* %in.gep.2
 478   %w = load i8, i8 addrspace(1)* %in.gep.3
 479
 480   store i8 %x, i8 addrspace(1)* %out  ; no align operand on any store
 481   store i8 %y, i8 addrspace(1)* %out.gep.1
 482   store i8 %z, i8 addrspace(1)* %out.gep.2
 483   store i8 %w, i8 addrspace(1)* %out.gep.3
 484   ret void
 485 }
 486
 487 ; This works once AA is enabled on the subtarget
 488 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
 489 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 490
 491 ; GCN-NOAA: buffer_store_dword v
 492 ; GCN-NOAA: buffer_store_dword v
 493 ; GCN-NOAA: buffer_store_dword v
 494 ; GCN-NOAA: buffer_store_dword v
 495
 496 ; GCN-AA: buffer_store_dwordx4 [[LOAD]]
 497
 498 ; GCN: s_endpgm
 499 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {  ; load one <4 x i32>, store its lanes as four scalar i32; GCN-NOAA expects four dword stores, GCN-AA one dwordx4 store
 500   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1  ; destination elements 1..3
 501   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 502   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 503   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in  ; single vector load of the whole source
 504
 505   %x = extractelement <4 x i32> %vec, i32 0  ; split the vector into lanes 0..3
 506   %y = extractelement <4 x i32> %vec, i32 1
 507   %z = extractelement <4 x i32> %vec, i32 2
 508   %w = extractelement <4 x i32> %vec, i32 3
 509
 510   store i32 %x, i32 addrspace(1)* %out  ; lane i goes to destination element i
 511   store i32 %y, i32 addrspace(1)* %out.gep.1
 512   store i32 %z, i32 addrspace(1)* %out.gep.2
 513   store i32 %w, i32 addrspace(1)* %out.gep.3
 514   ret void
 515 }
 516
 517 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
 518 ; GCN: ds_write_b8
 519 ; GCN: ds_write_b8
 520 ; GCN: s_endpgm
 521 define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {  ; addrspace(3) variant of the two-byte constant store; checks above expect two ds_write_b8
 522   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1  ; address of byte 1
 523
 524   store i8 123, i8 addrspace(3)* %out.gep.1  ; higher address is written first
 525   store i8 -56, i8 addrspace(3)* %out, align 2  ; was 456, which does not fit in i8; -56 is the same low 8 bits (0xc8)
 526   ret void
 527 }
 528
 529 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
 530 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 531 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 532 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
 533 define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {  ; two adjacent i32 constant stores in addrspace(3); check above expects one ds_write2_b32 with offset1:1
 534   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1  ; address of element 1
 535
 536   store i32 123, i32 addrspace(3)* %out.gep.1  ; 123 = 0x7b, the HI register in the checks
 537   store i32 456, i32 addrspace(3)* %out  ; 456 = 0x1c8, the LO register in the checks
 538   ret void
 539 }
 540
 541 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
 542 ; GCN: ds_write_b32
 543 ; GCN: ds_write_b32
 544 ; GCN: ds_write_b32
 545 ; GCN: ds_write_b32
 546 define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {  ; four adjacent i32 constant stores in addrspace(3); checks above expect four separate ds_write_b32
 547   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1  ; addresses of elements 1..3
 548   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
 549   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
 550
 551   store i32 123, i32 addrspace(3)* %out.gep.1
 552   store i32 456, i32 addrspace(3)* %out.gep.2
 553   store i32 333, i32 addrspace(3)* %out.gep.3
 554   store i32 1234, i32 addrspace(3)* %out  ; element 0 is written last
 555   ret void
 556 }
 557
 558 ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
 559 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
 560 ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
 561 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
 562 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
 563 ; GCN: buffer_store_dword v[[HI]]
 564 define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {  ; five consecutive i32 constant stores; checks above expect one dwordx4 store followed by one dword store
 565   store i32 9, i32 addrspace(1)* %out, align 4  ; element 0, the LO register of the dwordx4 in the checks
 566   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 567   store i32 12, i32 addrspace(1)* %idx1, align 4
 568   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 569   store i32 16, i32 addrspace(1)* %idx2, align 4
 570   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 571   store i32 -12, i32 addrspace(1)* %idx3, align 4  ; element 3, the HI4 register of the dwordx4 in the checks
 572   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 573   store i32 11, i32 addrspace(1)* %idx4, align 4  ; element 4, the value of the trailing single dword store
 574   ret void
 575 }
 576
 577 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
 578 ; GCN: buffer_store_dwordx4
 579 ; GCN: buffer_store_dwordx2
 580 define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {  ; six consecutive i32 constant stores; checks above expect one dwordx4 store then one dwordx2 store
 581   store i32 13, i32 addrspace(1)* %out, align 4  ; element 0; each following gep/store pair writes the next element
 582   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 583   store i32 15, i32 addrspace(1)* %idx1, align 4
 584   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 585   store i32 62, i32 addrspace(1)* %idx2, align 4
 586   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 587   store i32 63, i32 addrspace(1)* %idx3, align 4
 588   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 589   store i32 11, i32 addrspace(1)* %idx4, align 4
 590   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 591   store i32 123, i32 addrspace(1)* %idx5, align 4  ; element 5, the last one
 592   ret void
 593 }
 594
 595 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
 596 ; GCN: buffer_store_dwordx4
 597 ; GCN: buffer_store_dwordx2
 598 ; GCN: buffer_store_dword v
 599 define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {  ; seven consecutive i32 constant stores; checks above expect dwordx4, dwordx2 and dword stores
 600   store i32 34, i32 addrspace(1)* %out, align 4  ; element 0; each following gep/store pair writes the next element
 601   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 602   store i32 999, i32 addrspace(1)* %idx1, align 4
 603   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 604   store i32 65, i32 addrspace(1)* %idx2, align 4
 605   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 606   store i32 33, i32 addrspace(1)* %idx3, align 4
 607   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 608   store i32 98, i32 addrspace(1)* %idx4, align 4
 609   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 610   store i32 91, i32 addrspace(1)* %idx5, align 4
 611   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 612   store i32 212, i32 addrspace(1)* %idx6, align 4  ; element 6, the last one
 613   ret void
 614 }
 615
 616 ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
 617 ; GCN: buffer_store_dwordx4
 618 ; GCN: buffer_store_dwordx4
 619 ; GCN: s_endpgm
 620 define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {  ; eight consecutive i32 constant stores; checks above expect two dwordx4 stores
 621   store i32 34, i32 addrspace(1)* %out, align 4  ; element 0; each following gep/store pair writes the next element
 622   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 623   store i32 999, i32 addrspace(1)* %idx1, align 4
 624   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 625   store i32 65, i32 addrspace(1)* %idx2, align 4
 626   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 627   store i32 33, i32 addrspace(1)* %idx3, align 4
 628   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 629   store i32 98, i32 addrspace(1)* %idx4, align 4
 630   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 631   store i32 91, i32 addrspace(1)* %idx5, align 4
 632   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 633   store i32 212, i32 addrspace(1)* %idx6, align 4
 634   %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
 635   store i32 999, i32 addrspace(1)* %idx7, align 4  ; element 7, the last one
 636   ret void
 637 }
 638
 639 declare void @llvm.AMDGPU.barrier.local() #1  ; barrier intrinsic called by the *_inverse_i32 and 4-element *_shuffle_i32 tests
 640
 641 attributes #0 = { nounwind }  ; attribute group referenced by most test functions in this file
 642 attributes #1 = { noduplicate nounwind }  ; attribute group for the barrier declaration and its call sites