1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
3 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
5 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
6 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
9 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
10 target triple = "x86_64-unknown-linux-gnu"
14 ; SCALAR: extractelement <16 x float*>
15 ; SCALAR-NEXT: load float
16 ; SCALAR-NEXT: insertelement <16 x float>
17 ; SCALAR-NEXT: extractelement <16 x float*>
18 ; SCALAR-NEXT: load float
20 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
21 ; KNL_64-LABEL: test1:
23 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
24 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
25 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
28 ; KNL_32-LABEL: test1:
30 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
31 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
32 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
33 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
38 ; SKX-NEXT: kxnorw %k0, %k0, %k1
39 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
40 ; SKX-NEXT: vmovaps %zmm1, %zmm0
43 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
44 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
46 %sext_ind = sext <16 x i32> %ind to <16 x i64>
47 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
49 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; Declarations of the masked gather intrinsics exercised above. The operands
; are: vector of pointers, alignment (i32), per-lane mask, and passthru value
; for masked-off lanes.
53 declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
54 declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
55 declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
59 ; SCALAR: extractelement <16 x float*>
60 ; SCALAR-NEXT: load float
61 ; SCALAR-NEXT: insertelement <16 x float>
62 ; SCALAR-NEXT: br label %else
64 ; SCALAR-NEXT: %res.phi.else = phi
65 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
66 ; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
67 ; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2
69 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
70 ; KNL_64-LABEL: test2:
72 ; KNL_64-NEXT: kmovw %esi, %k1
73 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
74 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
77 ; KNL_32-LABEL: test2:
79 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
80 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
81 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
82 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
87 ; SKX-NEXT: kmovw %esi, %k1
88 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
89 ; SKX-NEXT: vmovaps %zmm1, %zmm0
92 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
93 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
95 %sext_ind = sext <16 x i32> %ind to <16 x i64>
96 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
97 %imask = bitcast i16 %mask to <16 x i1>
98 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
102 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
103 ; KNL_64-LABEL: test3:
105 ; KNL_64-NEXT: kmovw %esi, %k1
106 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
107 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
110 ; KNL_32-LABEL: test3:
112 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
113 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
114 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
115 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
120 ; SKX-NEXT: kmovw %esi, %k1
121 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
122 ; SKX-NEXT: vmovaps %zmm1, %zmm0
125 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
126 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
128 %sext_ind = sext <16 x i32> %ind to <16 x i64>
129 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
130 %imask = bitcast i16 %mask to <16 x i1>
131 %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
136 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
137 ; KNL_64-LABEL: test4:
139 ; KNL_64-NEXT: kmovw %esi, %k1
140 ; KNL_64-NEXT: kmovw %k1, %k2
141 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
142 ; KNL_64-NEXT: vmovaps %zmm1, %zmm2
143 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
144 ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
147 ; KNL_32-LABEL: test4:
149 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
150 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
151 ; KNL_32-NEXT: kmovw %k1, %k2
152 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
153 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2
154 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
155 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
160 ; SKX-NEXT: kmovw %esi, %k1
161 ; SKX-NEXT: kmovw %k1, %k2
162 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
163 ; SKX-NEXT: vmovaps %zmm1, %zmm2
164 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
165 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
168 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
169 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
171 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
172 %imask = bitcast i16 %mask to <16 x i1>
173 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
174 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
175 %res = add <16 x i32> %gt1, %gt2
180 ; SCALAR-LABEL: test5
181 ; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
182 ; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true
183 ; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else
184 ; SCALAR: cond.store:
185 ; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0
186 ; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
187 ; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
188 ; SCALAR-NEXT: br label %else
190 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
191 ; SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true
192 ; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2
194 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
195 ; KNL_64-LABEL: test5:
197 ; KNL_64-NEXT: kmovw %esi, %k1
198 ; KNL_64-NEXT: kmovw %k1, %k2
199 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
200 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
203 ; KNL_32-LABEL: test5:
205 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
206 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
207 ; KNL_32-NEXT: kmovw %k1, %k2
208 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
209 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
214 ; SKX-NEXT: kmovw %esi, %k1
215 ; SKX-NEXT: kmovw %k1, %k2
216 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
217 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
220 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
221 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
223 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
224 %imask = bitcast i16 %mask to <16 x i1>
225 call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
226 call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
; Declarations of the masked scatter intrinsics used by test5/test6. Operands
; are: value vector to store, vector of destination pointers, alignment (i32),
; and per-lane mask.
230 declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
231 declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
234 ; SCALAR-LABEL: test6
235 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
236 ; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1
237 ; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
238 ; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
239 ; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2
240 ; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
241 ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
243 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
244 ; KNL_64-LABEL: test6:
246 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
247 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
248 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
249 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
250 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
253 ; KNL_32-LABEL: test6:
255 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
256 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
257 ; KNL_32-NEXT: kxnorw %k0, %k0, %k2
258 ; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
259 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
260 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
265 ; SKX-NEXT: kxnorw %k0, %k0, %k1
266 ; SKX-NEXT: kxnorw %k0, %k0, %k2
267 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
268 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
269 ; SKX-NEXT: vmovaps %zmm2, %zmm0
272 %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
274 call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
278 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
280 ; KNL_64-LABEL: test7:
282 ; KNL_64-NEXT: movzbl %sil, %eax
283 ; KNL_64-NEXT: kmovw %eax, %k1
284 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
285 ; KNL_64-NEXT: kmovw %k1, %k2
286 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
287 ; KNL_64-NEXT: vmovaps %zmm1, %zmm2
288 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
289 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
292 ; KNL_32-LABEL: test7:
294 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
295 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
296 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
297 ; KNL_32-NEXT: kmovw %k1, %k2
298 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
299 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2
300 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
301 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
306 ; SKX-NEXT: kmovb %esi, %k1
307 ; SKX-NEXT: kmovw %k1, %k2
308 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
309 ; SKX-NEXT: vmovaps %zmm1, %zmm2
310 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
311 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
314 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
315 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
317 %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
318 %imask = bitcast i8 %mask to <8 x i1>
319 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
320 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
321 %res = add <8 x i32> %gt1, %gt2
325 ; No uniform base in this case: the <16 x i32*> operand itself carries the
326 ; addresses, so on 64-bit targets each gather call is split into two
327 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
328 ; KNL_64-LABEL: test8:
330 ; KNL_64-NEXT: kmovw %edi, %k1
331 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
332 ; KNL_64-NEXT: kmovw %k2, %k3
333 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
334 ; KNL_64-NEXT: kmovw %k1, %k3
335 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
336 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
337 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
338 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
339 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
340 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
343 ; KNL_32-LABEL: test8:
345 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
346 ; KNL_32-NEXT: kmovw %k1, %k2
347 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
348 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2
349 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
350 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
355 ; SKX-NEXT: kmovw %edi, %k1
356 ; SKX-NEXT: kshiftrw $8, %k1, %k2
357 ; SKX-NEXT: kmovw %k2, %k3
358 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
359 ; SKX-NEXT: kmovw %k1, %k3
360 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
361 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
362 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
363 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
364 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
365 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
368 ; SKX_32-LABEL: test8:
370 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
371 ; SKX_32-NEXT: kmovw %k1, %k2
372 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
373 ; SKX_32-NEXT: vmovaps %zmm1, %zmm2
374 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
375 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
378 %imask = bitcast i16 %mask to <16 x i1>
379 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
380 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
381 %res = add <16 x i32> %gt1, %gt2
; Aggregate types used by test9/test10 to exercise getelementptr with vector
; indices over nested structs (%struct.ST embeds %struct.RT).
385 %struct.RT = type { i8, [10 x [20 x i32]], i8 }
386 %struct.ST = type { i32, double, %struct.RT }
388 ; Masked gather for aggregate types
389 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
392 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
393 ; KNL_64-LABEL: test9:
394 ; KNL_64: # BB#0: # %entry
395 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
396 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
397 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
398 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
399 ; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
400 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
401 ; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
402 ; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
403 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
404 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
405 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
406 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
407 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
408 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
409 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
410 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
411 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
412 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
413 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
416 ; KNL_32-LABEL: test9:
417 ; KNL_32: # BB#0: # %entry
418 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
419 ; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3
420 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
421 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
422 ; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3
423 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
424 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
425 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
426 ; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1
427 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
428 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
429 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
430 ; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
434 ; SKX: # BB#0: # %entry
435 ; SKX-NEXT: vpbroadcastq %rdi, %zmm2
436 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
437 ; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
438 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
439 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
440 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
441 ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
442 ; SKX-NEXT: kxnorw %k0, %k0, %k1
443 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
446 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
447 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
449 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
450 %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
454 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
455 ; KNL_64-LABEL: test10:
456 ; KNL_64: # BB#0: # %entry
457 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
458 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
459 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
460 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
461 ; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
462 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
463 ; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
464 ; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
465 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
466 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
467 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
468 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
469 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
470 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
471 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
472 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
473 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
474 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
475 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
478 ; KNL_32-LABEL: test10:
479 ; KNL_32: # BB#0: # %entry
480 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
481 ; KNL_32-NEXT: vpbroadcastd .LCPI9_0, %ymm3
482 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
483 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
484 ; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3
485 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
486 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
487 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
488 ; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1
489 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
490 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
491 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
492 ; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
496 ; SKX: # BB#0: # %entry
497 ; SKX-NEXT: vpbroadcastq %rdi, %zmm2
498 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
499 ; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
500 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
501 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
502 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
503 ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
504 ; SKX-NEXT: kxnorw %k0, %k0, %k1
505 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
508 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
509 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
511 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
512 %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
516 ; Splat index in GEP, requires broadcast
517 define <16 x float> @test11(float* %base, i32 %ind) {
518 ; KNL_64-LABEL: test11:
520 ; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
521 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
522 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
525 ; KNL_32-LABEL: test11:
527 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
528 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1
529 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
530 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
535 ; SKX-NEXT: vpbroadcastd %esi, %zmm1
536 ; SKX-NEXT: kxnorw %k0, %k0, %k1
537 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
540 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
541 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
543 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
545 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
549 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
550 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
551 ; KNL_64-LABEL: test12:
553 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
554 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
555 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
558 ; KNL_32-LABEL: test12:
560 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
561 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
562 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
563 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
568 ; SKX-NEXT: kxnorw %k0, %k0, %k1
569 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
570 ; SKX-NEXT: vmovaps %zmm1, %zmm0
573 %sext_ind = sext <16 x i32> %ind to <16 x i64>
574 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
576 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
580 ; The same as the previous, but the mask is undefined
581 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
582 ; KNL_64-LABEL: test13:
584 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
585 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
588 ; KNL_32-LABEL: test13:
590 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
591 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
592 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
597 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
598 ; SKX-NEXT: vmovaps %zmm1, %zmm0
601 %sext_ind = sext <16 x i32> %ind to <16 x i64>
602 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
604 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
608 ; The base pointer is not a splat, so a uniform base can't be found
609 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
610 ; KNL_64-LABEL: test14:
612 ; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
613 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
614 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
615 ; KNL_64-NEXT: vmovd %esi, %xmm1
616 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
617 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
618 ; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
619 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
620 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
621 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
622 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
623 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
626 ; KNL_32-LABEL: test14:
628 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
629 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
630 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
631 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
632 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
633 ; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
638 ; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
639 ; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
640 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
641 ; SKX-NEXT: vmovd %esi, %xmm1
642 ; SKX-NEXT: vpbroadcastd %xmm1, %ymm1
643 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
644 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
645 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
646 ; SKX-NEXT: kshiftrw $8, %k0, %k1
647 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
648 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
649 ; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
652 ; SKX_32-LABEL: test14:
654 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
655 ; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
656 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
657 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
658 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
659 ; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
662 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
663 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
665 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
667 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
; Gather intrinsic declarations for element counts narrower than the native
; 512-bit gather instruction (used by test15-test17, which check widening).
671 declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
672 declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
673 declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
675 ; Gather smaller than existing instruction
676 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
678 ; KNL_64-LABEL: test15:
680 ; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2
681 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
682 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
683 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0
684 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
685 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1
686 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
689 ; KNL_32-LABEL: test15:
691 ; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2
692 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
693 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
694 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
695 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0
696 ; KNL_32-NEXT: vpandq .LCPI14_0, %zmm0, %zmm0
697 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1
698 ; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
703 ; SKX-NEXT: vpmovd2m %xmm1, %k1
704 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
705 ; SKX-NEXT: vmovaps %zmm1, %zmm0
708 %sext_ind = sext <4 x i32> %ind to <4 x i64>
709 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
710 %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
714 ; Gather smaller than existing instruction
715 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
717 ; KNL_64-LABEL: test16:
719 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
720 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
721 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
722 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
723 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
724 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
725 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
726 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
727 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
728 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
731 ; KNL_32-LABEL: test16:
733 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
734 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
735 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
736 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
737 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
738 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
739 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
740 ; KNL_32-NEXT: vpandq .LCPI15_0, %zmm1, %zmm1
741 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
742 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
743 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
748 ; SKX-NEXT: vpmovd2m %xmm1, %k1
749 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
750 ; SKX-NEXT: vmovaps %zmm2, %zmm0
753 %sext_ind = sext <4 x i32> %ind to <4 x i64>
754 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
755 %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
759 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
761 ; KNL_64-LABEL: test17:
763 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
764 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
765 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
766 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
767 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
768 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
771 ; KNL_32-LABEL: test17:
773 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
774 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
775 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
776 ; KNL_32-NEXT: vpandq .LCPI16_0, %zmm1, %zmm1
777 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
778 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
779 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
784 ; SKX-NEXT: vpmovq2m %xmm1, %k1
785 ; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
786 ; SKX-NEXT: vmovaps %zmm2, %zmm0
789 %sext_ind = sext <2 x i32> %ind to <2 x i64>
790 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
791 %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
; Scatter intrinsic declarations for narrow element counts (used by
; test18-test21, which check widening/promotion of the data type and mask).
795 declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
796 declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
797 declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
798 declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
799 declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
801 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
803 ; KNL_64-LABEL: test18:
805 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
806 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
807 ; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
808 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
809 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
810 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
813 ; KNL_32-LABEL: test18:
815 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
816 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
817 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
818 ; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
819 ; KNL_32-NEXT: vpandq .LCPI17_0, %zmm2, %zmm2
820 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
821 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
826 ; SKX-NEXT: vpmovd2m %xmm2, %k1
827 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
829 call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
833 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
835 ; KNL_64-LABEL: test19:
837 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
838 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
839 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
840 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
841 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
842 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
843 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
844 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
847 ; KNL_32-LABEL: test19:
849 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
850 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
851 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
852 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
853 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
854 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
855 ; KNL_32-NEXT: vpandq .LCPI18_0, %zmm1, %zmm1
856 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
857 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
862 ; SKX-NEXT: vpmovd2m %xmm1, %k1
863 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
866 ; SKX_32-LABEL: test19:
868 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
869 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
870 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
872 %gep = getelementptr double, double* %ptr, <4 x i64> %ind
873 call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
877 ; Data type requires widening
878 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
880 ; KNL_64-LABEL: test20:
882 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
883 ; KNL_64-NEXT: vmovq %xmm2, %xmm2
884 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
885 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
886 ; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
887 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
888 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
889 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
892 ; KNL_32-LABEL: test20:
894 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
895 ; KNL_32-NEXT: vmovq %xmm2, %xmm2
896 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
897 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
898 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
899 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
900 ; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
901 ; KNL_32-NEXT: vpandq .LCPI19_0, %zmm2, %zmm2
902 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
903 ; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
908 ; SKX-NEXT: vpmovq2m %xmm2, %k0
909 ; SKX-NEXT: kshiftlw $2, %k0, %k0
910 ; SKX-NEXT: kshiftrw $2, %k0, %k1
911 ; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
913 call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
917 ; Data type requires promotion
918 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
920 ; KNL_64-LABEL: test21:
922 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
923 ; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
924 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
925 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
926 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
927 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
930 ; KNL_32-LABEL: test21:
932 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
933 ; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
934 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
935 ; KNL_32-NEXT: vpandq .LCPI20_0, %zmm2, %zmm2
936 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
937 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
942 ; SKX-NEXT: vpmovq2m %xmm2, %k0
943 ; SKX-NEXT: kshiftlw $2, %k0, %k0
944 ; SKX-NEXT: kshiftrw $2, %k0, %k1
945 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
946 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
948 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
952 ; The result type requires widening
953 declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
955 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
958 ; KNL_64-LABEL: test22:
960 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
961 ; KNL_64-NEXT: vmovq %xmm1, %xmm1
962 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
963 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
964 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
965 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
966 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
967 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
968 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
969 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
970 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
973 ; KNL_32-LABEL: test22:
975 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
976 ; KNL_32-NEXT: vmovq %xmm1, %xmm1
977 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
978 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
979 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
980 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
981 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
982 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
983 ; KNL_32-NEXT: vpandq .LCPI21_0, %zmm1, %zmm1
984 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
985 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
986 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
991 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
992 ; SKX-NEXT: vpmovq2m %xmm1, %k0
993 ; SKX-NEXT: kshiftlw $2, %k0, %k0
994 ; SKX-NEXT: kshiftrw $2, %k0, %k1
995 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
996 ; SKX-NEXT: vmovaps %zmm2, %zmm0
998 %sext_ind = sext <2 x i32> %ind to <2 x i64>
999 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1000 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1004 declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1005 declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
1007 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1009 ; KNL_64-LABEL: test23:
1011 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
1012 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1013 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
1014 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
1015 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1016 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
1019 ; KNL_32-LABEL: test23:
1021 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
1022 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1023 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1024 ; KNL_32-NEXT: vpandq .LCPI22_0, %zmm1, %zmm1
1025 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1026 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1027 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1030 ; SKX-LABEL: test23:
1032 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1033 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1034 ; SKX-NEXT: vmovaps %zmm2, %zmm0
1036 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1037 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1038 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1042 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
1045 ; KNL_64-LABEL: test24:
1047 ; KNL_64-NEXT: movb $3, %al
1048 ; KNL_64-NEXT: movzbl %al, %eax
1049 ; KNL_64-NEXT: kmovw %eax, %k1
1050 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1051 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1054 ; KNL_32-LABEL: test24:
1056 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1057 ; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
1058 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
1059 ; KNL_32-NEXT: vpandq .LCPI23_1, %zmm1, %zmm1
1060 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1061 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1062 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1065 ; SKX-LABEL: test24:
1067 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1068 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1069 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1071 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1072 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1073 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1077 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
1079 ; KNL_64-LABEL: test25:
1081 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
1082 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1083 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
1084 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
1085 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1086 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
1089 ; KNL_32-LABEL: test25:
1091 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
1092 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1093 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1094 ; KNL_32-NEXT: vpandq .LCPI24_0, %zmm1, %zmm1
1095 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1096 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1097 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1100 ; SKX-LABEL: test25:
1102 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1103 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1104 ; SKX-NEXT: vmovaps %zmm2, %zmm0
1106 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1107 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1108 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
1112 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
1114 ; KNL_64-LABEL: test26:
1116 ; KNL_64-NEXT: movb $3, %al
1117 ; KNL_64-NEXT: movzbl %al, %eax
1118 ; KNL_64-NEXT: kmovw %eax, %k1
1119 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1120 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1123 ; KNL_32-LABEL: test26:
1125 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1126 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
1127 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
1128 ; KNL_32-NEXT: vpandq .LCPI25_1, %zmm2, %zmm2
1129 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
1130 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1131 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1134 ; SKX-LABEL: test26:
1136 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1137 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1138 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1140 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1141 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1142 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
1146 ; Result type requires widening; all-ones mask
1147 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
1149 ; KNL_64-LABEL: test27:
1151 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1152 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
1153 ; KNL_64-NEXT: movb $3, %al
1154 ; KNL_64-NEXT: movzbl %al, %eax
1155 ; KNL_64-NEXT: kmovw %eax, %k1
1156 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
1159 ; KNL_32-LABEL: test27:
1161 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1162 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1163 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
1164 ; KNL_32-NEXT: movb $3, %cl
1165 ; KNL_32-NEXT: movzbl %cl, %ecx
1166 ; KNL_32-NEXT: kmovw %ecx, %k1
1167 ; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
1170 ; SKX-LABEL: test27:
1172 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
1173 ; SKX-NEXT: movb $3, %al
1174 ; SKX-NEXT: kmovb %eax, %k1
1175 ; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
1177 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1178 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1179 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
1183 ; Data type requires promotion, mask is all-ones
1184 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
1187 ; KNL_64-LABEL: test28:
1189 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1190 ; KNL_64-NEXT: movb $3, %al
1191 ; KNL_64-NEXT: movzbl %al, %eax
1192 ; KNL_64-NEXT: kmovw %eax, %k1
1193 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1196 ; KNL_32-LABEL: test28:
1198 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1199 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
1200 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
1201 ; KNL_32-NEXT: vpandq .LCPI27_1, %zmm2, %zmm2
1202 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
1203 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1206 ; SKX-LABEL: test28:
1208 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1209 ; SKX-NEXT: movb $3, %al
1210 ; SKX-NEXT: kmovb %eax, %k1
1211 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1213 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
1218 ; SCALAR-LABEL: test29
1219 ; SCALAR: extractelement <16 x float*>
1220 ; SCALAR-NEXT: load float
1221 ; SCALAR-NEXT: insertelement <16 x float>
1222 ; SCALAR-NEXT: extractelement <16 x float*>
1223 ; SCALAR-NEXT: load float
1225 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
1226 ; KNL_64-LABEL: test29:
1228 ; KNL_64-NEXT: movw $44, %ax
1229 ; KNL_64-NEXT: kmovw %eax, %k1
1230 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1231 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1234 ; KNL_32-LABEL: test29:
1236 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1237 ; KNL_32-NEXT: movw $44, %cx
1238 ; KNL_32-NEXT: kmovw %ecx, %k1
1239 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1240 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1243 ; SKX-LABEL: test29:
1245 ; SKX-NEXT: movw $44, %ax
1246 ; SKX-NEXT: kmovw %eax, %k1
1247 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1248 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1251 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
1252 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
1254 %sext_ind = sext <16 x i32> %ind to <16 x i64>
1255 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
1257 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
1258 ret <16 x float>%res
1261 ; Check non-power-of-2 case. It should be scalarized.
1262 declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
1263 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1264 ; KNL_64-LABEL: test30:
1266 ; KNL_64-NEXT: andl $1, %edx
1267 ; KNL_64-NEXT: kmovw %edx, %k1
1268 ; KNL_64-NEXT: andl $1, %esi
1269 ; KNL_64-NEXT: kmovw %esi, %k2
1270 ; KNL_64-NEXT: movl %edi, %eax
1271 ; KNL_64-NEXT: andl $1, %eax
1272 ; KNL_64-NEXT: kmovw %eax, %k0
1273 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
1274 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
1275 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1276 ; KNL_64-NEXT: # implicit-def: %XMM0
1277 ; KNL_64-NEXT: testb $1, %dil
1278 ; KNL_64-NEXT: je .LBB29_2
1279 ; KNL_64-NEXT: # BB#1: # %cond.load
1280 ; KNL_64-NEXT: vmovq %xmm1, %rax
1281 ; KNL_64-NEXT: vmovd (%rax), %xmm0
1282 ; KNL_64-NEXT: .LBB29_2: # %else
1283 ; KNL_64-NEXT: kmovw %k2, %eax
1284 ; KNL_64-NEXT: movl %eax, %ecx
1285 ; KNL_64-NEXT: andl $1, %ecx
1286 ; KNL_64-NEXT: testb %cl, %cl
1287 ; KNL_64-NEXT: je .LBB29_4
1288 ; KNL_64-NEXT: # BB#3: # %cond.load1
1289 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
1290 ; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
1291 ; KNL_64-NEXT: .LBB29_4: # %else2
1292 ; KNL_64-NEXT: kmovw %k1, %ecx
1293 ; KNL_64-NEXT: movl %ecx, %edx
1294 ; KNL_64-NEXT: andl $1, %edx
1295 ; KNL_64-NEXT: testb %dl, %dl
1296 ; KNL_64-NEXT: je .LBB29_6
1297 ; KNL_64-NEXT: # BB#5: # %cond.load4
1298 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
1299 ; KNL_64-NEXT: vmovq %xmm1, %rdx
1300 ; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0
1301 ; KNL_64-NEXT: .LBB29_6: # %else5
1302 ; KNL_64-NEXT: kmovw %k0, %edx
1303 ; KNL_64-NEXT: vmovd %edx, %xmm1
1304 ; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
1305 ; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
1306 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
1307 ; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
1310 ; KNL_32-LABEL: test30:
1312 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1313 ; KNL_32-NEXT: andl $1, %eax
1314 ; KNL_32-NEXT: kmovw %eax, %k1
1315 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1316 ; KNL_32-NEXT: andl $1, %eax
1317 ; KNL_32-NEXT: kmovw %eax, %k2
1318 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1319 ; KNL_32-NEXT: movl %eax, %ecx
1320 ; KNL_32-NEXT: andl $1, %ecx
1321 ; KNL_32-NEXT: kmovw %ecx, %k0
1322 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
1323 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1324 ; KNL_32-NEXT: # implicit-def: %XMM0
1325 ; KNL_32-NEXT: testb $1, %al
1326 ; KNL_32-NEXT: je .LBB29_2
1327 ; KNL_32-NEXT: # BB#1: # %cond.load
1328 ; KNL_32-NEXT: vmovd %xmm1, %eax
1329 ; KNL_32-NEXT: vmovd (%eax), %xmm0
1330 ; KNL_32-NEXT: .LBB29_2: # %else
1331 ; KNL_32-NEXT: kmovw %k2, %eax
1332 ; KNL_32-NEXT: movl %eax, %ecx
1333 ; KNL_32-NEXT: andl $1, %ecx
1334 ; KNL_32-NEXT: testb %cl, %cl
1335 ; KNL_32-NEXT: je .LBB29_4
1336 ; KNL_32-NEXT: # BB#3: # %cond.load1
1337 ; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
1338 ; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
1339 ; KNL_32-NEXT: .LBB29_4: # %else2
1340 ; KNL_32-NEXT: kmovw %k1, %ecx
1341 ; KNL_32-NEXT: movl %ecx, %edx
1342 ; KNL_32-NEXT: andl $1, %edx
1343 ; KNL_32-NEXT: testb %dl, %dl
1344 ; KNL_32-NEXT: je .LBB29_6
1345 ; KNL_32-NEXT: # BB#5: # %cond.load4
1346 ; KNL_32-NEXT: vpextrd $2, %xmm1, %edx
1347 ; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0
1348 ; KNL_32-NEXT: .LBB29_6: # %else5
1349 ; KNL_32-NEXT: kmovw %k0, %edx
1350 ; KNL_32-NEXT: vmovd %edx, %xmm1
1351 ; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
1352 ; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
1353 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1354 ; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
1357 ; SKX-LABEL: test30:
1359 ; SKX-NEXT: vpmovd2m %xmm2, %k1
1360 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
1361 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
1362 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
1363 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1364 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
1365 ; SKX-NEXT: # implicit-def: %XMM0
1366 ; SKX-NEXT: andb $1, %al
1367 ; SKX-NEXT: je .LBB29_2
1368 ; SKX-NEXT: # BB#1: # %cond.load
1369 ; SKX-NEXT: vmovq %xmm1, %rax
1370 ; SKX-NEXT: vmovd (%rax), %xmm0
1371 ; SKX-NEXT: .LBB29_2: # %else
1372 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
1373 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
1374 ; SKX-NEXT: andb $1, %al
1375 ; SKX-NEXT: je .LBB29_4
1376 ; SKX-NEXT: # BB#3: # %cond.load1
1377 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
1378 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
1379 ; SKX-NEXT: .LBB29_4: # %else2
1380 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
1381 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
1382 ; SKX-NEXT: andb $1, %al
1383 ; SKX-NEXT: je .LBB29_6
1384 ; SKX-NEXT: # BB#5: # %cond.load4
1385 ; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
1386 ; SKX-NEXT: vmovq %xmm1, %rax
1387 ; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
1388 ; SKX-NEXT: .LBB29_6: # %else5
1389 ; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
1390 ; SKX-NEXT: vmovaps %zmm3, %zmm0
1393 %sext_ind = sext <3 x i32> %ind to <3 x i64>
1394 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
1395 %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
1399 declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
1404 define <16 x float*> @test31(<16 x float**> %ptrs) {
1405 ; KNL_64-LABEL: test31:
1407 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
1408 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
1409 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1410 ; KNL_64-NEXT: kshiftrw $8, %k1, %k1
1411 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1412 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
1413 ; KNL_64-NEXT: vmovaps %zmm3, %zmm1
1416 ; KNL_32-LABEL: test31:
1418 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
1419 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1420 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1423 ; SKX-LABEL: test31:
1425 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1426 ; SKX-NEXT: kxnorw %k0, %k0, %k2
1427 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1428 ; SKX-NEXT: kshiftrw $8, %k1, %k1
1429 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1430 ; SKX-NEXT: vmovaps %zmm2, %zmm0
1431 ; SKX-NEXT: vmovaps %zmm3, %zmm1
1434 ; SKX_32-LABEL: test31:
1436 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1437 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1438 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
1441 %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
1442 ret <16 x float*>%res
1445 define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1446 ; KNL_64-LABEL: test_gather_16i32:
1448 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1449 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1450 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1451 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1452 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1453 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1454 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1455 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1458 ; KNL_32-LABEL: test_gather_16i32:
1460 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1461 ; KNL_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
1462 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1463 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1464 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1467 ; SKX-LABEL: test_gather_16i32:
1469 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1470 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1471 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1472 ; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
1473 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1474 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1475 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1476 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
1479 ; SKX_32-LABEL: test_gather_16i32:
1481 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1482 ; SKX_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
1483 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1484 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1485 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
1487 %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
1490 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1491 ; KNL_64-LABEL: test_gather_16i64:
1493 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1494 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1495 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1496 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1497 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1498 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1499 ; KNL_64-NEXT: vmovaps %zmm3, %zmm0
1500 ; KNL_64-NEXT: vmovaps %zmm4, %zmm1
1503 ; KNL_32-LABEL: test_gather_16i64:
1505 ; KNL_32-NEXT: pushl %ebp
1506 ; KNL_32-NEXT: .Ltmp0:
1507 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1508 ; KNL_32-NEXT: .Ltmp1:
1509 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1510 ; KNL_32-NEXT: movl %esp, %ebp
1511 ; KNL_32-NEXT: .Ltmp2:
1512 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1513 ; KNL_32-NEXT: andl $-64, %esp
1514 ; KNL_32-NEXT: subl $64, %esp
1515 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1516 ; KNL_32-NEXT: vpandd .LCPI32_0{1to16}, %zmm1, %zmm1
1517 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1518 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1519 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1520 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1521 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1522 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1523 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1524 ; KNL_32-NEXT: movl %ebp, %esp
1525 ; KNL_32-NEXT: popl %ebp
1528 ; SKX-LABEL: test_gather_16i64:
1530 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1531 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1532 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1533 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1534 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1535 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1536 ; SKX-NEXT: vmovaps %zmm3, %zmm0
1537 ; SKX-NEXT: vmovaps %zmm4, %zmm1
1539 %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
1542 declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
1543 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1544 ; KNL_64-LABEL: test_gather_16f32:
1546 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1547 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1548 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1549 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
1550 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1551 ; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1552 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1553 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
1556 ; KNL_32-LABEL: test_gather_16f32:
1558 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1559 ; KNL_32-NEXT: vpandd .LCPI33_0{1to16}, %zmm1, %zmm1
1560 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1561 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
1562 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1565 ; SKX-LABEL: test_gather_16f32:
1567 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1568 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1569 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1570 ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
1571 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1572 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1573 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1574 ; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
1576 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
1577 ret <16 x float> %res
1579 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
1580 ; KNL_64-LABEL: test_gather_16f64:
1582 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1583 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1584 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1585 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1586 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
1587 ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
1588 ; KNL_64-NEXT: vmovaps %zmm3, %zmm0
1589 ; KNL_64-NEXT: vmovaps %zmm4, %zmm1
1592 ; KNL_32-LABEL: test_gather_16f64:
1594 ; KNL_32-NEXT: pushl %ebp
1595 ; KNL_32-NEXT: .Ltmp3:
1596 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1597 ; KNL_32-NEXT: .Ltmp4:
1598 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1599 ; KNL_32-NEXT: movl %esp, %ebp
1600 ; KNL_32-NEXT: .Ltmp5:
1601 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1602 ; KNL_32-NEXT: andl $-64, %esp
1603 ; KNL_32-NEXT: subl $64, %esp
1604 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1605 ; KNL_32-NEXT: vpandd .LCPI34_0{1to16}, %zmm1, %zmm1
1606 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1607 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
1608 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1609 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
1610 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1611 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
1612 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1613 ; KNL_32-NEXT: movl %ebp, %esp
1614 ; KNL_32-NEXT: popl %ebp
1617 ; SKX-LABEL: test_gather_16f64:
1619 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1620 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1621 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1622 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1623 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
1624 ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
1625 ; SKX-NEXT: vmovaps %zmm3, %zmm0
1626 ; SKX-NEXT: vmovaps %zmm4, %zmm1
1628 %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
1629 ret <16 x double> %res
1631 declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
1632 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1633 ; KNL_64-LABEL: test_scatter_16i32:
1635 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1636 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1637 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1638 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1639 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
1640 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
1641 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
1644 ; KNL_32-LABEL: test_scatter_16i32:
1646 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1647 ; KNL_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
1648 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1649 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
1652 ; SKX-LABEL: test_scatter_16i32:
1654 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1655 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1656 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1657 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1658 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
1659 ; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
1660 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
1663 ; SKX_32-LABEL: test_scatter_16i32:
1665 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1666 ; SKX_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
1667 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1668 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
1670 call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
1673 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1674 ; KNL_64-LABEL: test_scatter_16i64:
1676 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1677 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1678 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1679 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1680 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
1681 ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
1684 ; KNL_32-LABEL: test_scatter_16i64:
1686 ; KNL_32-NEXT: pushl %ebp
1687 ; KNL_32-NEXT: .Ltmp6:
1688 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1689 ; KNL_32-NEXT: .Ltmp7:
1690 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1691 ; KNL_32-NEXT: movl %esp, %ebp
1692 ; KNL_32-NEXT: .Ltmp8:
1693 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1694 ; KNL_32-NEXT: andl $-64, %esp
1695 ; KNL_32-NEXT: subl $64, %esp
1696 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1697 ; KNL_32-NEXT: vpandd .LCPI36_0{1to16}, %zmm1, %zmm1
1698 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1699 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1700 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1701 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
1702 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1703 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
1704 ; KNL_32-NEXT: movl %ebp, %esp
1705 ; KNL_32-NEXT: popl %ebp
1708 ; SKX-LABEL: test_scatter_16i64:
1710 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1711 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1712 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1713 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1714 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
1715 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
1717 call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
1720 declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
1721 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1722 ; KNL_64-LABEL: test_scatter_16f32:
1724 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1725 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1726 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1727 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1728 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
1729 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
1730 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
1733 ; KNL_32-LABEL: test_scatter_16f32:
1735 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1736 ; KNL_32-NEXT: vpandd .LCPI37_0{1to16}, %zmm1, %zmm1
1737 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1738 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
1741 ; SKX-LABEL: test_scatter_16f32:
1743 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1744 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1745 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1746 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1747 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
1748 ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
1749 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
1751 call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
1754 declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
1755 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
1756 ; KNL_64-LABEL: test_scatter_16f64:
1758 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1759 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1760 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1761 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1762 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
1763 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
1766 ; KNL_32-LABEL: test_scatter_16f64:
1768 ; KNL_32-NEXT: pushl %ebp
1769 ; KNL_32-NEXT: .Ltmp9:
1770 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1771 ; KNL_32-NEXT: .Ltmp10:
1772 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1773 ; KNL_32-NEXT: movl %esp, %ebp
1774 ; KNL_32-NEXT: .Ltmp11:
1775 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1776 ; KNL_32-NEXT: andl $-64, %esp
1777 ; KNL_32-NEXT: subl $64, %esp
1778 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1779 ; KNL_32-NEXT: vpandd .LCPI38_0{1to16}, %zmm1, %zmm1
1780 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1781 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
1782 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1783 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
1784 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1785 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
1786 ; KNL_32-NEXT: movl %ebp, %esp
1787 ; KNL_32-NEXT: popl %ebp
1790 ; SKX-LABEL: test_scatter_16f64:
1792 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1793 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1794 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1795 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1796 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
1797 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
1799 call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
1802 declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)