test/CodeGen/AMDGPU/insert_vector_elt.ll

   1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
   2 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
   3
   4 ; FIXME: Broken on evergreen
   5 ; FIXME: For some reason the 8 and 16 vectors are being stored as
   6 ; individual elements instead of 128-bit stores.
   7
   8
   9 ; FIXME: Why is the constant moved into the intermediate register and
  10 ; not just directly into the vector component?
  11
   12 ; SI-LABEL: {{^}}insertelement_v4f32_0:
      ; NOTE(review): the five pattern lines below carry no check prefix, so
      ; FileCheck does not enforce them; only the label above is verified.
   13 ; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]:
   14 ; v_mov_b32_e32
   15 ; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
   16 ; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]]
   17 ; buffer_store_dwordx4 v{{[}}[[LOW_REG]]:
      ; Replaces element 0 of the <4 x float> argument %a with the constant 5.0
      ; and stores the resulting vector to %out (16-byte aligned).
   18 define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   19   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
   20   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   21   ret void
   22 }
  23
   24 ; SI-LABEL: {{^}}insertelement_v4f32_1:
      ; Constant-index insert: writes 5.0 into element 1 of the <4 x float>
      ; argument and stores the vector to %out. Only the function label is
      ; checked, so this verifies that the function is emitted at all.
   25 define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   26   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
   27   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   28   ret void
   29 }
  30
   31 ; SI-LABEL: {{^}}insertelement_v4f32_2:
      ; Constant-index insert: writes 5.0 into element 2 of the <4 x float>
      ; argument and stores the vector to %out. Only the function label is
      ; checked.
   32 define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   33   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
   34   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   35   ret void
   36 }
  37
   38 ; SI-LABEL: {{^}}insertelement_v4f32_3:
      ; Constant-index insert: writes 5.0 into the last element (index 3) of
      ; the <4 x float> argument and stores the vector to %out. Only the
      ; function label is checked.
   39 define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   40   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
   41   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   42   ret void
   43 }
  44
   45 ; SI-LABEL: {{^}}insertelement_v4i32_0:
      ; Integer counterpart of the constant-index float tests: writes 999 into
      ; element 0 of the <4 x i32> argument and stores the vector to %out.
      ; Only the function label is checked.
   46 define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
   47   %vecins = insertelement <4 x i32> %a, i32 999, i32 0
   48   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   49   ret void
   50 }
  51
   52 ; SI-LABEL: {{^}}dynamic_insertelement_v2f32:
   53 ; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
   54 ; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
   55 ; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
      ; Inserts 5.0 (bit pattern 0x40a00000) into the <2 x float> argument at
      ; the runtime index %b. The checks expect the constant to be materialized
      ; in a VGPR, written with v_movreld_b32, and the vector stored with a
      ; single 64-bit store whose first register is the movreld destination.
   56 define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
   57   %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
   58   store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
   59   ret void
   60 }
  61
   62 ; SI-LABEL: {{^}}dynamic_insertelement_v4f32:
   63 ; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
   64 ; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
   65 ; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
      ; Inserts 5.0 (bit pattern 0x40a00000) into the <4 x float> argument at
      ; the runtime index %b. Expected code: constant in a VGPR, one
      ; v_movreld_b32, then a single 128-bit store starting at the movreld
      ; destination register.
   66 define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
   67   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
   68   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   69   ret void
   70 }
  71
   72 ; SI-LABEL: {{^}}dynamic_insertelement_v8f32:
   73 ; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
   74 ; SI: buffer_store_dwordx4
   75 ; SI: buffer_store_dwordx4
      ; Inserts 5.0 into the <8 x float> argument at the runtime index %b.
      ; The checks expect one v_movreld_b32 followed by two 128-bit stores
      ; covering the 32-byte result.
   76 define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
   77   %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
   78   store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
   79   ret void
   80 }
  81
   82 ; SI-LABEL: {{^}}dynamic_insertelement_v16f32:
   83 ; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
   84 ; SI: buffer_store_dwordx4
   85 ; SI: buffer_store_dwordx4
   86 ; SI: buffer_store_dwordx4
   87 ; SI: buffer_store_dwordx4
      ; Inserts 5.0 into the <16 x float> argument at the runtime index %b.
      ; The checks expect one v_movreld_b32 followed by four 128-bit stores
      ; covering the 64-byte result.
   88 define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
   89   %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
   90   store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
   91   ret void
   92 }
  93
   94 ; SI-LABEL: {{^}}dynamic_insertelement_v2i32:
   95 ; SI: buffer_store_dwordx2
      ; Inserts the integer 5 into the <2 x i32> argument at the runtime index
      ; %b. Only the final 64-bit store is checked; how the element is written
      ; is left unconstrained.
   96 define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
   97   %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
   98   store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
   99   ret void
  100 }
 101
  102 ; SI-LABEL: {{^}}dynamic_insertelement_v4i32:
  103 ; SI: buffer_store_dwordx4
      ; Inserts the integer 5 into the <4 x i32> argument at the runtime index
      ; %b. Only the final 128-bit store is checked.
  104 define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind {
  105   %vecins = insertelement <4 x i32> %a, i32 5, i32 %b
  106   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  107   ret void
  108 }
 109
  110 ; SI-LABEL: {{^}}dynamic_insertelement_v8i32:
  111 ; FIXMESI: buffer_store_dwordx4
  112 ; FIXMESI: buffer_store_dwordx4
      ; Inserts the integer 5 into the <8 x i32> argument at the runtime index
      ; %b. The store expectations above use a prefix that is not enabled by
      ; any invocation of this test, so only the function label is verified.
  113 define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
  114   %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
  115   store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
  116   ret void
  117 }
 118
  119 ; SI-LABEL: {{^}}dynamic_insertelement_v16i32:
  120 ; FIXMESI: buffer_store_dwordx4
  121 ; FIXMESI: buffer_store_dwordx4
  122 ; FIXMESI: buffer_store_dwordx4
  123 ; FIXMESI: buffer_store_dwordx4
      ; Inserts the integer 5 into the <16 x i32> argument at the runtime index
      ; %b. The four store expectations above are disabled (their prefix is
      ; not enabled by any invocation), so only the function label is verified.
  124 define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
  125   %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
  126   store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
  127   ret void
  128 }
 129
 130
  131 ; SI-LABEL: {{^}}dynamic_insertelement_v2i16:
  132 ; FIXMESI: buffer_store_dwordx2
      ; Inserts the 16-bit integer 5 into the <2 x i16> argument at the runtime
      ; index %b. The store expectation above is disabled; only the function
      ; label is verified.
      ; NOTE(review): a <2 x i16> occupies 32 bits, so the disabled 64-bit
      ; store pattern looks wider than the value - confirm before enabling.
  133 define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
  134   %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
  135   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
  136   ret void
  137 }
 138
  139 ; SI-LABEL: {{^}}dynamic_insertelement_v4i16:
  140 ; FIXMESI: buffer_store_dwordx4
      ; Inserts the 16-bit integer 5 into the <4 x i16> argument at the runtime
      ; index %b. The store expectation above is disabled; only the function
      ; label is verified.
      ; NOTE(review): a <4 x i16> occupies 64 bits, so the disabled 128-bit
      ; store pattern looks wider than the value - confirm before enabling.
  141 define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
  142   %vecins = insertelement <4 x i16> %a, i16 5, i32 %b
  143   store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16
  144   ret void
  145 }
 146
 147
  148 ; SI-LABEL: {{^}}dynamic_insertelement_v2i8:
  149 ; FIXMESI: BUFFER_STORE_USHORT
      ; Inserts the 8-bit integer 5 into the <2 x i8> argument at the runtime
      ; index %b. The store expectation above is disabled; only the function
      ; label is verified.
      ; NOTE(review): the disabled pattern is upper-case while every active
      ; pattern in this file is lower-case, and FileCheck matching is
      ; case-sensitive - the mnemonic needs revisiting before it is enabled.
  150 define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
  151   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
  152   store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
  153   ret void
  154 }
 155
  156 ; SI-LABEL: {{^}}dynamic_insertelement_v4i8:
  157 ; FIXMESI: buffer_store_dword
      ; Inserts the 8-bit integer 5 into the <4 x i8> argument at the runtime
      ; index %b. The 32-bit store expectation above is disabled; only the
      ; function label is verified.
  158 define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
  159   %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
  160   store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16
  161   ret void
  162 }
 163
  164 ; SI-LABEL: {{^}}dynamic_insertelement_v8i8:
  165 ; FIXMESI: buffer_store_dwordx2
      ; Inserts the 8-bit integer 5 into the <8 x i8> argument at the runtime
      ; index %b. The 64-bit store expectation above is disabled; only the
      ; function label is verified.
  166 define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
  167   %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
  168   store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16
  169   ret void
  170 }
 171
  172 ; SI-LABEL: {{^}}dynamic_insertelement_v16i8:
  173 ; FIXMESI: buffer_store_dwordx4
      ; Inserts the 8-bit integer 5 into the <16 x i8> argument at the runtime
      ; index %b. The 128-bit store expectation above is disabled; only the
      ; function label is verified.
  174 define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
  175   %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
  176   store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
  177   ret void
  178 }
 179
  180 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
  181 ; the compiler doesn't crash.
  182 ; SI-LABEL: {{^}}insert_split_bb:
      ; Builds a <2 x i32> across a branch: element 0 is set in the entry block,
      ; element 1 is filled from a load in whichever successor runs, and the
      ; two partially built vectors are merged by a phi before the store.
      ; %b is accepted but never used.
  183 define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
  184 entry:
      ; Element 0 comes from the argument %a; element 1 is still undef here.
  185   %0 = insertelement <2 x i32> undef, i32 %a, i32 0
  186   %1 = icmp eq i32 %a, 0
  187   br i1 %1, label %if, label %else
  188
  189 if:
      ; %a == 0: element 1 is loaded from %in[0].
  190   %2 = load i32, i32 addrspace(1)* %in
  191   %3 = insertelement <2 x i32> %0, i32 %2, i32 1
  192   br label %endif
  193
  194 else:
      ; %a != 0: element 1 is loaded from %in[1].
  195   %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  196   %5 = load i32, i32 addrspace(1)* %4
  197   %6 = insertelement <2 x i32> %0, i32 %5, i32 1
  198   br label %endif
  199
  200 endif:
      ; Merge the vector produced by either predecessor and store it.
  201   %7 = phi <2 x i32> [%3, %if], [%6, %else]
  202   store <2 x i32> %7, <2 x i32> addrspace(1)* %out
  203   ret void
  204 }
 205
  206 ; SI-LABEL: {{^}}dynamic_insertelement_v2f64:
      ; The index argument is loaded, then shifted left by one because each
      ; 64-bit element spans two 32-bit registers. The low dword of 8.0 is 0.
  207 ; SI: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
  208 ; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
  209 ; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}}
  210
      ; The four dwords of the input vector are copied from SGPRs to VGPRs.
  211 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
  212 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
  213 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
  214 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
  215
      ; Write the low dword at the scaled index through m0.
  216 ; SI: s_mov_b32 m0, [[SCALEDIDX]]
  217 ; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
  218
  219 ; Increment to next element.
  220 ; FIXME: Should be able to manipulate m0 directly instead of add and
  221 ; copy.
  222
      ; The scaled index is even, so the pattern below accepts an OR with 1 as
      ; the increment. 0x40200000 is the high dword of the double 8.0.
  223 ; SI: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1
  224 ; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
  225 ; SI-DAG: s_mov_b32 m0, [[IDX1]]
  226 ; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
  227
  228 ; SI: buffer_store_dwordx4
  229 ; SI: s_endpgm
      ; Inserts the double 8.0 into the <2 x double> argument at the runtime
      ; index %b and stores the 16-byte result to %out.
  230 define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
  231   %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
  232   store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
  233   ret void
  234 }
 235
  236 ; FIXME: Inline immediate should be folded into v_movreld_b32.
  237 ; SI-LABEL: {{^}}dynamic_insertelement_v2i64:
  238
      ; The 64-bit constant 5 is split into its low dword (5) and high dword
      ; (0); each half is materialized in a VGPR first.
  239 ; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}}
  240 ; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}}
  241
      ; One indexed move per dword; their relative order is not constrained.
  242 ; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
  243 ; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
  244
  245 ; SI: buffer_store_dwordx4
  246 ; SI: s_endpgm
      ; Inserts the 64-bit integer 5 into the <2 x i64> argument at the runtime
      ; index %b and stores the 16-byte result to %out (8-byte aligned).
  247 define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
  248   %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  249   store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  250   ret void
  251 }
 252
  253 ; FIXME: Should be able to do without stack access. The used stack
  254 ; space is also 2x what should be required.
  255
  256 ; SI-LABEL: {{^}}dynamic_insertelement_v4f64:
  257 ; SI: SCRATCH_RSRC_DWORD
  258
  259 ; Stack store
      ; The 32-byte input vector is spilled as two 128-bit pieces (offsets 0
      ; and 16); either order is accepted.
  260 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
  261 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
  262
  263 ; Write element
      ; The new 64-bit element is written over the spilled copy with one store.
  264 ; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
  265
  266 ; Stack reload
  267 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
  268 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
  269
  270 ; Store result
  271 ; SI: buffer_store_dwordx4
  272 ; SI: buffer_store_dwordx4
  273 ; SI: s_endpgm
      ; A <4 x double> is 32 bytes; the 64 bytes expected below are the doubled
      ; stack usage that the FIXME at the top of this test refers to.
  274 ; SI: ScratchSize: 64
  275
      ; Inserts the double 8.0 into the <4 x double> argument at the runtime
      ; index %b and stores the 32-byte result to %out.
  276 define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
  277   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
  278   store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  279   ret void
  280 }
 281
 282 ; SI-LABEL: {{^}}dynamic_insertelement_v8f64:
 283 ; SI: SCRATCH_RSRC_DWORD
 284
 285 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 286 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
 287 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}}
 288 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}}
 289
 290 ; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 291
 292 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
 293 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 294 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
 295 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 296
 297 ; SI: buffer_store_dwordx4
 298 ; SI: buffer_store_dwordx4
 299 ; SI: buffer_store_dwordx4
 300 ; SI: buffer_store_dwordx4
 301 ; SI: s_endpgm
 302 ; SI: ScratchSize: 128
 303 define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
 304   %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
 305   store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
 306   ret void
 307 }