1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
5 define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
6 ; CHECK-LABEL: test_pcmpeq_b_256
7 ; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
8 %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
12 define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
13 ; CHECK-LABEL: test_mask_pcmpeq_b_256
14 ; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
15 %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
19 declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
21 define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
22 ; CHECK-LABEL: test_pcmpeq_w_256
23 ; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
24 %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
28 define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
29 ; CHECK-LABEL: test_mask_pcmpeq_w_256
30 ; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
31 %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
35 declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
37 define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
38 ; CHECK-LABEL: test_pcmpgt_b_256
39 ; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
40 %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
44 define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
45 ; CHECK-LABEL: test_mask_pcmpgt_b_256
46 ; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
47 %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
51 declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
53 define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
54 ; CHECK-LABEL: test_pcmpgt_w_256
55 ; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
56 %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
60 define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
61 ; CHECK-LABEL: test_mask_pcmpgt_w_256
62 ; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
63 %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
67 declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
69 define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
70 ; CHECK-LABEL: test_cmp_b_256
71 ; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
72 %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 0, i32 -1)
73 %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
74 ; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
75 %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 1, i32 -1)
76 %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
77 ; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
78 %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 2, i32 -1)
79 %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
80 ; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
81 %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 3, i32 -1)
82 %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
83 ; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
84 %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 4, i32 -1)
85 %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
86 ; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
87 %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 5, i32 -1)
88 %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
89 ; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
90 %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 6, i32 -1)
91 %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
92 ; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
93 %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 7, i32 -1)
94 %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
98 define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
99 ; CHECK-LABEL: test_mask_cmp_b_256
100 ; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
101 %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 0, i32 %mask)
102 %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
103 ; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
104 %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 1, i32 %mask)
105 %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
106 ; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
107 %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 2, i32 %mask)
108 %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
109 ; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
110 %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 3, i32 %mask)
111 %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
112 ; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
113 %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 4, i32 %mask)
114 %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
115 ; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
116 %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 5, i32 %mask)
117 %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
118 ; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
119 %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 6, i32 %mask)
120 %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
121 ; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
122 %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 7, i32 %mask)
123 %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
127 declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i8, i32) nounwind readnone
129 define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
130 ; CHECK-LABEL: test_ucmp_b_256
131 ; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
132 %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 0, i32 -1)
133 %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
134 ; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
135 %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 1, i32 -1)
136 %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
137 ; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
138 %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 2, i32 -1)
139 %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
140 ; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
141 %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 3, i32 -1)
142 %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
143 ; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
144 %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 4, i32 -1)
145 %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
146 ; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
147 %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 5, i32 -1)
148 %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
149 ; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
150 %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 6, i32 -1)
151 %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
152 ; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
153 %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 7, i32 -1)
154 %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
158 define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
159 ; CHECK-LABEL: test_mask_ucmp_b_256
160 ; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
161 %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 0, i32 %mask)
162 %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
163 ; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
164 %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 1, i32 %mask)
165 %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
166 ; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
167 %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 2, i32 %mask)
168 %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
169 ; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
170 %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 3, i32 %mask)
171 %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
172 ; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
173 %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 4, i32 %mask)
174 %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
175 ; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
176 %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 5, i32 %mask)
177 %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
178 ; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
179 %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 6, i32 %mask)
180 %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
181 ; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
182 %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i8 7, i32 %mask)
183 %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
187 declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i8, i32) nounwind readnone
189 define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
190 ; CHECK-LABEL: test_cmp_w_256
191 ; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
192 %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 0, i16 -1)
193 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
194 ; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
195 %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 1, i16 -1)
196 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
197 ; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
198 %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 2, i16 -1)
199 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
200 ; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
201 %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 3, i16 -1)
202 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
203 ; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
204 %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 4, i16 -1)
205 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
206 ; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
207 %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 5, i16 -1)
208 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
209 ; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
210 %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 6, i16 -1)
211 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
212 ; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
213 %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 7, i16 -1)
214 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
218 define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
219 ; CHECK-LABEL: test_mask_cmp_w_256
220 ; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
221 %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 0, i16 %mask)
222 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
223 ; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
224 %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 1, i16 %mask)
225 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
226 ; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
227 %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 2, i16 %mask)
228 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
229 ; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
230 %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 3, i16 %mask)
231 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
232 ; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
233 %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 4, i16 %mask)
234 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
235 ; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
236 %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 5, i16 %mask)
237 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
238 ; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
239 %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 6, i16 %mask)
240 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
241 ; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
242 %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 7, i16 %mask)
243 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
247 declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i8, i16) nounwind readnone
249 define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
250 ; CHECK-LABEL: test_ucmp_w_256
251 ; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
252 %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 0, i16 -1)
253 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
254 ; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
255 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 1, i16 -1)
256 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
257 ; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
258 %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 2, i16 -1)
259 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
260 ; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
261 %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 3, i16 -1)
262 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
263 ; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
264 %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 4, i16 -1)
265 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
266 ; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
267 %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 5, i16 -1)
268 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
269 ; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
270 %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 6, i16 -1)
271 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
272 ; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
273 %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 7, i16 -1)
274 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
278 define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
279 ; CHECK-LABEL: test_mask_ucmp_w_256
280 ; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
281 %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 0, i16 %mask)
282 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
283 ; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
284 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 1, i16 %mask)
285 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
286 ; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
287 %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 2, i16 %mask)
288 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
289 ; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
290 %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 3, i16 %mask)
291 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
292 ; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
293 %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 4, i16 %mask)
294 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
295 ; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
296 %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 5, i16 %mask)
297 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
298 ; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
299 %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 6, i16 %mask)
300 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
301 ; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
302 %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i8 7, i16 %mask)
303 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
307 declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i8, i16) nounwind readnone
311 define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
312 ; CHECK-LABEL: test_pcmpeq_b_128
313 ; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
314 %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
318 define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
319 ; CHECK-LABEL: test_mask_pcmpeq_b_128
320 ; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
321 %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
325 declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)
327 define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
328 ; CHECK-LABEL: test_pcmpeq_w_128
329 ; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
330 %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
334 define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
335 ; CHECK-LABEL: test_mask_pcmpeq_w_128
336 ; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
337 %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
341 declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)
343 define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
344 ; CHECK-LABEL: test_pcmpgt_b_128
345 ; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
346 %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
350 define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
351 ; CHECK-LABEL: test_mask_pcmpgt_b_128
352 ; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
353 %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
357 declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)
359 define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
360 ; CHECK-LABEL: test_pcmpgt_w_128
361 ; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
362 %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
366 define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
367 ; CHECK-LABEL: test_mask_pcmpgt_w_128
368 ; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
369 %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
373 declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
375 define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
376 ; CHECK-LABEL: test_cmp_b_128
377 ; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
378 %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 0, i16 -1)
379 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
380 ; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
381 %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 1, i16 -1)
382 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
383 ; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
384 %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 2, i16 -1)
385 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
386 ; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
387 %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 3, i16 -1)
388 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
389 ; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
390 %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 4, i16 -1)
391 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
392 ; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
393 %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 5, i16 -1)
394 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
395 ; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
396 %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 6, i16 -1)
397 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
398 ; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
399 %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 7, i16 -1)
400 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
404 define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
405 ; CHECK-LABEL: test_mask_cmp_b_128
406 ; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
407 %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 0, i16 %mask)
408 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
409 ; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
410 %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 1, i16 %mask)
411 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
412 ; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
413 %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 2, i16 %mask)
414 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
415 ; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
416 %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 3, i16 %mask)
417 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
418 ; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
419 %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 4, i16 %mask)
420 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
421 ; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
422 %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 5, i16 %mask)
423 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
424 ; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
425 %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 6, i16 %mask)
426 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
427 ; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
428 %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 7, i16 %mask)
429 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
433 declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i8, i16) nounwind readnone
435 define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
436 ; CHECK-LABEL: test_ucmp_b_128
437 ; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
438 %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 0, i16 -1)
439 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
440 ; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
441 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 1, i16 -1)
442 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
443 ; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
444 %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 2, i16 -1)
445 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
446 ; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
447 %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 3, i16 -1)
448 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
449 ; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
450 %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 4, i16 -1)
451 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
452 ; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
453 %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 5, i16 -1)
454 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
455 ; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
456 %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 6, i16 -1)
457 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
458 ; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
459 %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 7, i16 -1)
460 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
464 define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
465 ; CHECK-LABEL: test_mask_ucmp_b_128
466 ; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
467 %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 0, i16 %mask)
468 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
469 ; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
470 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 1, i16 %mask)
471 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
472 ; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
473 %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 2, i16 %mask)
474 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
475 ; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
476 %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 3, i16 %mask)
477 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
478 ; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
479 %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 4, i16 %mask)
480 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
481 ; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
482 %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 5, i16 %mask)
483 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
484 ; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
485 %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 6, i16 %mask)
486 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
487 ; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
488 %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i8 7, i16 %mask)
489 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
493 declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i8, i16) nounwind readnone
495 define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
496 ; CHECK-LABEL: test_cmp_w_128
497 ; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
498 %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 0, i8 -1)
499 %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
500 ; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
501 %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 1, i8 -1)
502 %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
503 ; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
504 %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 2, i8 -1)
505 %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
506 ; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
507 %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 3, i8 -1)
508 %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
509 ; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
510 %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 4, i8 -1)
511 %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
512 ; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
513 %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 5, i8 -1)
514 %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
515 ; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
516 %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 6, i8 -1)
517 %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
518 ; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
519 %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 7, i8 -1)
520 %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
524 define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
525 ; CHECK-LABEL: test_mask_cmp_w_128
526 ; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
527 %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 0, i8 %mask)
528 %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
529 ; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
530 %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 1, i8 %mask)
531 %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
532 ; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
533 %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 2, i8 %mask)
534 %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
535 ; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
536 %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 3, i8 %mask)
537 %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
538 ; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
539 %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 4, i8 %mask)
540 %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
541 ; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
542 %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 5, i8 %mask)
543 %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
544 ; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
545 %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 6, i8 %mask)
546 %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
547 ; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
548 %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 7, i8 %mask)
549 %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
553 declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i8, i8) nounwind readnone
555 define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
556 ; CHECK-LABEL: test_ucmp_w_128
557 ; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
558 %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 0, i8 -1)
559 %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
560 ; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
561 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 1, i8 -1)
562 %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
563 ; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
564 %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 2, i8 -1)
565 %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
566 ; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
567 %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 3, i8 -1)
568 %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
569 ; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
570 %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 4, i8 -1)
571 %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
572 ; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
573 %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 5, i8 -1)
574 %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
575 ; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
576 %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 6, i8 -1)
577 %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
578 ; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
579 %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 7, i8 -1)
580 %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
584 define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
585 ; CHECK-LABEL: test_mask_ucmp_w_128
586 ; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
587 %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 0, i8 %mask)
588 %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
589 ; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
590 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 1, i8 %mask)
591 %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
592 ; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
593 %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 2, i8 %mask)
594 %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
595 ; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
596 %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 3, i8 %mask)
597 %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
598 ; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
599 %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 4, i8 %mask)
600 %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
601 ; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
602 %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 5, i8 %mask)
603 %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
604 ; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
605 %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 6, i8 %mask)
606 %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
607 ; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
608 %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i8 7, i8 %mask)
609 %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
613 declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i8, i8) nounwind readnone
615 declare <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
617 define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
618 ; CHECK-LABEL: test_mask_vfmadd256_ps
619 ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
620 %res = call <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
624 declare <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
626 define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
627 ; CHECK-LABEL: test_mask_vfmadd128_ps
628 ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
629 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
633 declare <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
635 define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
636 ; CHECK-LABEL: test_mask_fmadd256_pd:
637 ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
638 %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
639 ret <4 x double> %res
642 declare <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
644 define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
645 ; CHECK-LABEL: test_mask_fmadd128_pd:
646 ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
647 %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
648 ret <2 x double> %res
651 declare <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
653 define <8 x float> @test_mask_vfmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
654 ; CHECK-LABEL: test_mask_vfmsub256_ps
655 ; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xaa,0xc2]
656 %res = call <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
660 declare <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
662 define <4 x float> @test_mask_vfmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
663 ; CHECK-LABEL: test_mask_vfmsub128_ps
664 ; CHECK: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaa,0xc2]
665 %res = call <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
669 declare <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
671 define <4 x double> @test_mask_vfmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
672 ; CHECK-LABEL: test_mask_vfmsub256_pd
673 ; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xaa,0xc2]
674 %res = call <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
675 ret <4 x double> %res
678 declare <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
680 define <2 x double> @test_mask_vfmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
681 ; CHECK-LABEL: test_mask_vfmsub128_pd
682 ; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaa,0xc2]
683 %res = call <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
684 ret <2 x double> %res
687 declare <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
689 define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
690 ; CHECK-LABEL: test_mask_vfnmadd256_ps
691 ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
692 %res = call <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
696 declare <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
698 define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
699 ; CHECK-LABEL: test_mask_vfnmadd128_ps
700 ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
701 %res = call <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
705 declare <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
707 define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
708 ; CHECK-LABEL: test_mask_vfnmadd256_pd
709 ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
710 %res = call <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
711 ret <4 x double> %res
714 declare <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
716 define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
717 ; CHECK-LABEL: test_mask_vfnmadd128_pd
718 ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
719 %res = call <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
720 ret <2 x double> %res
723 declare <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
725 define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
726 ; CHECK-LABEL: test_mask_vfnmsub256_ps
727 ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
728 %res = call <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
732 declare <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
734 define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
735 ; CHECK-LABEL: test_mask_vfnmsub128_ps
736 ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
737 %res = call <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
741 declare <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
743 define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
744 ; CHECK-LABEL: test_mask_vfnmsub256_pd
745 ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
746 %res = call <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
747 ret <4 x double> %res
750 declare <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
752 define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
753 ; CHECK-LABEL: test_mask_vfnmsub128_pd
754 ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
755 %res = call <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
756 ret <2 x double> %res
759 declare <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
761 define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
762 ; CHECK-LABEL: test_mask_fmaddsub256_ps:
763 ; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
764 %res = call <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
768 declare <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
770 define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
771 ; CHECK-LABEL: test_mask_fmaddsub128_ps:
772 ; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
773 %res = call <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
777 declare <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
779 define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
780 ; CHECK-LABEL: test_mask_vfmaddsub256_pd
781 ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
782 %res = call <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
783 ret <4 x double> %res
786 declare <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
788 define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
789 ; CHECK-LABEL: test_mask_vfmaddsub128_pd
790 ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
791 %res = call <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
792 ret <2 x double> %res
795 declare <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
797 define <8 x float> @test_mask_vfmsubadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
798 ; CHECK-LABEL: test_mask_vfmsubadd256_ps
799 ; CHECK: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa7,0xc2]
800 %res = call <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
804 declare <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
806 define <4 x float> @test_mask_vfmsubadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
807 ; CHECK-LABEL: test_mask_vfmsubadd128_ps
808 ; CHECK: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa7,0xc2]
809 %res = call <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
813 declare <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
815 define <4 x double> @test_mask_vfmsubadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
816 ; CHECK-LABEL: test_mask_vfmsubadd256_pd
817 ; CHECK: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa7,0xc2]
818 %res = call <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
819 ret <4 x double> %res
821 declare <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
823 define <2 x double> @test_mask_vfmsubadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
824 ; CHECK-LABEL: test_mask_vfmsubadd128_pd
825 ; CHECK: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0xc2]
826 %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
827 ret <2 x double> %res
830 define <2 x double> @test_mask_vfmsubadd128rm_pd(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
831 ; CHECK-LABEL: test_mask_vfmsubadd128rm_pd
832 ; CHECK: vfmsubadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0x07]
833 %a2 = load <2 x double>* %ptr_a2
834 %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
835 ret <2 x double> %res
837 declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
838 define <8 x double> @test_mask_vfmsubaddrm_pd(<8 x double> %a0, <8 x double> %a1, <8 x double>* %ptr_a2, i8 %mask) {
839 ; CHECK-LABEL: test_mask_vfmsubaddrm_pd
840 ; CHECK: vfmsubadd213pd (%rdi), %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa7,0x07]
841 %a2 = load <8 x double>* %ptr_a2, align 8
842 %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
843 ret <8 x double> %res
846 define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
847 ; CHECK-LABEL: test_mask_vfmadd128_ps_r
848 ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
849 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
853 define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
854 ; CHECK-LABEL: test_mask_vfmadd128_ps_rz
855 ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
856 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
860 define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
861 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk
862 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
863 %a2 = load <4 x float>* %ptr_a2
864 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
868 define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
869 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka
870 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
871 %a2 = load <4 x float>* %ptr_a2, align 8
872 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
876 define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
877 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz
878 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
879 %a2 = load <4 x float>* %ptr_a2
880 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
884 define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
885 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza
886 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
887 %a2 = load <4 x float>* %ptr_a2, align 4
888 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
892 define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
893 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb
894 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
895 %q = load float* %ptr_a2
896 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
897 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
898 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
899 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
900 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
904 define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
905 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba
906 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
907 %q = load float* %ptr_a2, align 4
908 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
909 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
910 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
911 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
912 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
916 define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
917 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz
918 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
919 %q = load float* %ptr_a2
920 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
921 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
922 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
923 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
924 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
928 define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
929 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza
930 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
931 %q = load float* %ptr_a2, align 4
932 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
933 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
934 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
935 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
936 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
940 define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
941 ; CHECK-LABEL: test_mask_vfmadd128_pd_r
942 ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
943 %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
944 ret <2 x double> %res
947 define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
948 ; CHECK-LABEL: test_mask_vfmadd128_pd_rz
949 ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
950 %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
951 ret <2 x double> %res
954 define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
955 ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk
956 ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
957 %a2 = load <2 x double>* %ptr_a2
958 %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
959 ret <2 x double> %res
962 define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
963 ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz
964 ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
965 %a2 = load <2 x double>* %ptr_a2
966 %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
967 ret <2 x double> %res
970 define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
971 ; CHECK-LABEL: test_mask_vfmadd256_pd_r
972 ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
973 %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
974 ret <4 x double> %res
977 define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
978 ; CHECK-LABEL: test_mask_vfmadd256_pd_rz
979 ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
980 %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
981 ret <4 x double> %res
984 define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
985 ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk
986 ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
987 %a2 = load <4 x double>* %ptr_a2
988 %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
989 ret <4 x double> %res
992 define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
993 ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz
994 ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
995 %a2 = load <4 x double>* %ptr_a2
996 %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
997 ret <4 x double> %res