+
+; Masked gather of 16 x i32. With 64-bit pointers the 16 addresses span two
+; zmm registers, so codegen splits the gather into two 8-wide vpgatherqd ops
+; (mask split via kshiftrw) and reassembles the result; with 32-bit pointers
+; a single 16-wide vpgatherdd suffices. The vpmovsxbd/vpandd/vptestmd prologue
+; materializes the <16 x i1> mask argument into %k1.
+; NOTE(review): CHECK lines look script-generated; prefer regenerating them
+; (update_llc_test_checks.py) over hand edits.
+define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
+; KNL_64-LABEL: test_gather_16i32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16i32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_16i32:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
+; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: retl
+ %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
+ ret <16 x i32> %res
+}
+; Masked gather of 16 x i64: the 1024-bit result spans two zmm registers.
+; 64-bit targets emit two 8-wide vpgatherqq ops with the mask split via
+; kshiftrw. The 32-bit KNL path realigns the stack to 64 bytes and loads
+; part of the passthru (%src0) from the caller's frame at 8(%ebp) --
+; presumably because not all zmm arguments fit in registers; confirm against
+; the 32-bit calling convention if editing.
+; NOTE(review): CHECK lines look script-generated; regenerate rather than
+; hand-edit.
+define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
+; KNL_64-LABEL: test_gather_16i64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
+; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
+; KNL_64-NEXT: vmovaps %zmm3, %zmm0
+; KNL_64-NEXT: vmovaps %zmm4, %zmm1
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16i64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp0:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp1:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp2:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI32_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
+; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
+; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: vmovaps %zmm4, %zmm1
+; SKX-NEXT: retq
+ %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
+ ret <16 x i64> %res
+}
+declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
+; Masked gather of 16 x float -- floating-point analogue of the i32 case:
+; 64-bit pointers force a split into two 8-wide vgatherqps ops, while 32-bit
+; pointers allow a single 16-wide vgatherdps.
+; NOTE(review): CHECK lines look script-generated; regenerate rather than
+; hand-edit.
+define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_gather_16f32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
+; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16f32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI33_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16f32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
+; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: retq
+ %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+ ret <16 x float> %res
+}
+; Masked gather of 16 x double -- floating-point analogue of the i64 case:
+; two 8-wide vgatherqpd ops on 64-bit targets; the 32-bit KNL path realigns
+; the stack to 64 bytes and loads part of the passthru from 8(%ebp) before
+; issuing two vgatherdpd ops with dword indices.
+; NOTE(review): CHECK lines look script-generated; regenerate rather than
+; hand-edit.
+define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
+; KNL_64-LABEL: test_gather_16f64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
+; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
+; KNL_64-NEXT: vmovaps %zmm3, %zmm0
+; KNL_64-NEXT: vmovaps %zmm4, %zmm1
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16f64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp3:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp4:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp5:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI34_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
+; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_gather_16f64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
+; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
+; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: vmovaps %zmm4, %zmm1
+; SKX-NEXT: retq
+ %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
+ ret <16 x double> %res
+}
+declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
+; Masked scatter of 16 x i32: mirrors test_gather_16i32. With 64-bit
+; pointers the store is split into two 8-wide vpscatterqd ops (mask split
+; via kshiftrw, high half of the source extracted); with 32-bit pointers a
+; single 16-wide vpscatterdd is emitted.
+; NOTE(review): CHECK lines look script-generated; regenerate rather than
+; hand-edit.
+define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
+; KNL_64-LABEL: test_scatter_16i32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
+; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16i32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
+; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
+; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_scatter_16i32:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; SKX_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
+; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; SKX_32-NEXT: retl
+ call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+; Masked scatter of 16 x i64: source and pointers each span two zmm
+; registers, so 64-bit targets emit two 8-wide vpscatterqq ops. The 32-bit
+; KNL path realigns the stack to 64 bytes, loads part of the source vector
+; from the caller's frame at 8(%ebp), and issues two vpscatterdq ops with
+; dword indices.
+; NOTE(review): CHECK lines look script-generated; regenerate rather than
+; hand-edit.
+define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
+; KNL_64-LABEL: test_scatter_16i64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16i64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp6:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp7:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp8:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI36_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
+; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
+; Masked scatter of 16 x float -- floating-point analogue of the i32 case:
+; two 8-wide vscatterqps ops on 64-bit targets, one 16-wide vscatterdps on
+; 32-bit KNL.
+; NOTE(review): CHECK lines look script-generated; regenerate rather than
+; hand-edit.
+define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_scatter_16f32:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
+; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16f32:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI37_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16f32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
+; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
+; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
+; Masked scatter of 16 x double -- floating-point analogue of the i64 case:
+; two 8-wide vscatterqpd ops on 64-bit targets; the 32-bit KNL path realigns
+; the stack to 64 bytes, loads part of the source vector from 8(%ebp), and
+; issues two vscatterdpd ops with dword indices.
+; NOTE(review): CHECK lines look script-generated; regenerate rather than
+; hand-edit.
+define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
+; KNL_64-LABEL: test_scatter_16f64:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
+; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_16f64:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Ltmp9:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Ltmp10:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Ltmp11:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-64, %esp
+; KNL_32-NEXT: subl $64, %esp
+; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL_32-NEXT: vpandd .LCPI38_0{1to16}, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
+; KNL_32-NEXT: kshiftrw $8, %k1, %k2
+; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
+; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_16f64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
+; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
+; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: retq
+ call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)