X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=test%2FCodeGen%2FX86%2Favx512vl-intrinsics.ll;h=8ab34bd8c436e73d714817ae12356efc1481ff10;hp=a600057c9094428f8fdb33cc265cc2489c3e808b;hb=024ff64164bd3944f0ff54061cb3ff9675b9cc06;hpb=f657b6395ac5c9fa57a3d4c871d7f32c565b11fe diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index a600057c909..8ab34bd8c43 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3010,7 +3010,9 @@ declare <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double>, <2 x doub define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_128: ; CHECK: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: ## xmm2 = xmm2[1],k1[1] ; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x15,0xc1] +; CHECK-NEXT: ## xmm0 = xmm0[1],xmm1[1] %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -3022,7 +3024,9 @@ declare <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double>, <4 x doub define <4 x double>@test_int_x86_avx512_mask_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_256: ; CHECK: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm2[1],k1[1],ymm2[3],k1[3] ; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x15,0xc1] +; CHECK-NEXT: ## ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -3034,7 +3038,9 @@ declare <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float>, <4 x float> define <4 x float>@test_int_x86_avx512_mask_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_128: ; CHECK: vunpckhps %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: ## xmm2 = xmm2[2],k1[2],xmm2[3],k1[3] ; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x15,0xc1] +; CHECK-NEXT: ## xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -3047,7 +3053,9 @@ define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_256: ; CHECK: ## BB#0: ; CHECK: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm2[2],k1[2],ymm2[3],k1[3],ymm2[6],k1[6],ymm2[7],k1[7] ; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x15,0xc1] +; CHECK-NEXT: ## ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> 
@llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -3059,7 +3067,9 @@ declare <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double>, <2 x doub define <2 x double>@test_int_x86_avx512_mask_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_128: ; CHECK: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0] ; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x14,0xc1] +; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0] %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) %res2 = fadd <2 x double> %res, %res1 @@ -3071,7 +3081,9 @@ declare <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double>, <4 x doub define <4 x double>@test_int_x86_avx512_mask_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_256: ; CHECK: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[2],k1[2] ; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x14,0xc1] +; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) %res2 = fadd <4 x double> %res, %res1 @@ -3083,7 +3095,9 @@ declare <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float>, <4 x float> define <4 x float>@test_int_x86_avx512_mask_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_128: ; CHECK: vunpcklps %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1] ; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x14,0xc1] +; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) %res2 = fadd <4 x float> %res, %res1 @@ -3095,7 +3109,9 @@ declare <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float>, <8 x float> define <8 x float>@test_int_x86_avx512_mask_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_256: ; CHECK: vunpcklps %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[4],k1[4],ymm2[5],k1[5] ; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x14,0xc1] +; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 @@ -3107,7 +3123,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32>, <4 x i32>, <4 define <4 x i32>@test_int_x86_avx512_mask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> 
%x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_128: ; CHECK: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: ## xmm2 = xmm2[2],k1[2],xmm2[3],k1[3] ; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6a,0xc1] +; CHECK-NEXT: ## xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3119,7 +3137,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32>, <4 x i32>, <4 define <4 x i32>@test_int_x86_avx512_mask_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_128: ; CHECK: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1] ; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x62,0xc1] +; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) %res2 = add <4 x i32> %res, %res1 @@ -3132,7 +3152,9 @@ define <8 x i32>@test_int_x86_avx512_mask_punpckhd_q_256(<8 x i32> %x0, <8 x i32 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_256: ; CHECK: ## BB#0: ; CHECK: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm2[2],k1[2],ymm2[3],k1[3],ymm2[6],k1[6],ymm2[7],k1[7] ; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6a,0xc1] +; CHECK-NEXT: ## ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3144,7 +3166,9 @@ declare <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32>, <8 x i32>, <8 define <8 x i32>@test_int_x86_avx512_mask_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_256: ; CHECK: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[4],k1[4],ymm2[5],k1[5] ; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x62,0xc1] +; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) %res2 = add <8 x i32> %res, %res1 @@ -3156,7 +3180,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64>, <2 x i64>, <2 define <2 x i64>@test_int_x86_avx512_mask_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128: ; CHECK: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: ## xmm2 = xmm2[1],k1[1] ; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6d,0xc1] +; CHECK-NEXT: ## xmm0 = xmm0[1],xmm1[1] %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = 
call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -3168,7 +3194,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64>, <2 x i64>, <2 define <2 x i64>@test_int_x86_avx512_mask_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128: ; CHECK: vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0] ; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6c,0xc1] +; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0] %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) %res2 = add <2 x i64> %res, %res1 @@ -3180,7 +3208,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64>, <4 x i64>, <4 define <4 x i64>@test_int_x86_avx512_mask_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256: ; CHECK: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[2],k1[2] ; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6c,0xc1] +; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -3192,7 +3222,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64>, <4 x i64>, <4 define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256: ; CHECK: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm2[1],k1[1],ymm2[3],k1[3] ; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6d,0xc1] +; CHECK-NEXT: ## ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) %res2 = add <4 x i64> %res, %res1 @@ -4694,9 +4726,12 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: ## xmm2 = xmm2[0],k1[1] ; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: ## xmm3 = k1[0],xmm0[1] ; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm0 -; CHECK: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[1] +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 22, <2 x double> %x3, i8 %x4) @@ -4715,8 +4750,10 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_pd_256(<4 x double> %x0, <4 x ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm2[0],k1[1],ymm2[3],k1[2] ; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm0 -; CHECK: vaddpd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: ## ymm0 = 
ymm0[0],ymm1[1],ymm0[3],ymm1[2] +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: retq %res = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1) @@ -4732,8 +4769,10 @@ define <4 x float>@test_int_x86_avx512_mask_shuf_ps_128(<4 x float> %x0, <4 x fl ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: ## xmm2 = xmm2[2,1],k1[1,0] ; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm0 -; CHECK: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: ## xmm0 = xmm0[2,1],xmm1[1,0] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 %x4) %res1 = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 -1) @@ -4749,8 +4788,10 @@ define <8 x float>@test_int_x86_avx512_mask_shuf_ps_256(<8 x float> %x0, <8 x fl ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: ## ymm2 = ymm2[2,1],k1[1,0],ymm2[6,5],k1[5,4] ; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm0 -; CHECK: vaddps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) @@ -4838,8 +4879,11 @@ define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: ## ymm1 = ymm1[0,1,3,2] ; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: ## ymm2 = k1[0,1,3,2] ; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[0,1,3,2] ; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq @@ -4859,8 +4903,11 @@ define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: ## xmm1 = xmm1[1,0] ; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: ## xmm2 = k1[1,0] ; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm0 +; CHECK-NEXT: ## xmm0 = xmm0[1,0] ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -4880,8 +4927,11 @@ define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: ## ymm1 = ymm1[2,1,1,0,6,5,5,4] ; CHECK-NEXT: vpermilps $22, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: ## ymm2 = k1[2,1,1,0,6,5,5,4] ; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[2,1,1,0,6,5,5,4] ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -4901,8 +4951,11 @@ define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: 
vpermilps $22, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: ## xmm1 = xmm1[2,1,1,0] ; CHECK-NEXT: vpermilps $22, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: ## xmm2 = k1[2,1,1,0] ; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 +; CHECK-NEXT: ## xmm0 = xmm0[2,1,1,0] ; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq @@ -5184,3 +5237,1153 @@ define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6 %res2 = add <4 x i64> %res, %res1 ret <4 x i64> %res2 } + +declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) + %res2 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) + %res3 = add <8 x i32> %res, %res1 + %res4 = add <8 x i32> %res2, %res3 + ret <8 x i32> %res4 +} + +declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) + %res2 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask) + %res3 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res2, %res3 + ret <4 x i32> %res4 +} + +declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) + %res1 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 %mask) + %res2 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer,i8 %mask) + %res3 = add <4 x i64> %res, %res1 + %res4 = add <4 x i64> %res2, %res3 + ret <4 x i64> %res4 +} + +declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) { +; 
CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) + %res1 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 %mask) + %res2 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer,i8 %mask) + %res3 = add <2 x i64> %res, %res1 + %res4 = add <2 x i64> %res2, %res3 + ret <2 x i64> %res4 +} + +define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { + ; CHECK: test_x86_vcvtph2ps_128 + ; CHECK: vcvtph2ps %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float> @test_x86_vcvtph2ps_128_rrk(<8 x i16> %a0,<4 x float> %a1, i8 %mask) { + ; CHECK: test_x86_vcvtph2ps_128_rrk + ; CHECK: vcvtph2ps %xmm0, %xmm1 {%k1} + %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + + +define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) { + ; CHECK: test_x86_vcvtph2ps_128_rrkz + ; CHECK: vcvtph2ps %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16>, <4 x float>, i8) nounwind readonly + +define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) { + ; CHECK: test_x86_vcvtph2ps_256 + ; CHECK: vcvtph2ps %xmm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8 %mask) { + ; CHECK: test_x86_vcvtph2ps_256_rrk + ; CHECK: vcvtph2ps %xmm0, %ymm1 {%k1} + %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_x86_vcvtph2ps_256_rrkz(<8 x i16> %a0, i8 %mask) { + ; CHECK: test_x86_vcvtph2ps_256_rrkz + ; CHECK: vcvtph2ps %xmm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, i8) nounwind readonly + +define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) { + ; CHECK: test_x86_vcvtps2ph_128 + ; CHECK: vcvtps2ph $2, %xmm0, %xmm0 + %res = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + + +declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float>, i32, <8 x i16>, i8) nounwind readonly + +define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) { + ; CHECK: test_x86_vcvtps2ph_256 + ; CHECK: vcvtps2ph $2, %ymm0, %xmm0 + %res = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float>, i32, <8 x i16>, i8) nounwind readonly + +declare <4 x float> 
@llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} +; CHECK-NEXT: ## xmm1 = xmm0[0,0,2,2] +; CHECK-NEXT: vmovsldup %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: ## xmm2 = xmm0[0,0,2,2] +; CHECK-NEXT: vmovsldup %xmm0, %xmm0 +; CHECK-NEXT: ## xmm0 = xmm0[0,0,2,2] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) + %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1) + %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} + +declare <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} +; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vmovsldup %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vmovsldup %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) + %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1) + %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} +; CHECK-NEXT: ## xmm1 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: ## xmm2 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup %xmm0, %xmm0 +; CHECK-NEXT: ## xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) + %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1) + %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} + +declare <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_movshdup_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} +; CHECK-NEXT: ## ymm1 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vmovshdup %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: ## ymm2 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vmovshdup %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) + %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1) + %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} +declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} +; CHECK-NEXT: ## xmm1 = xmm0[0,0] +; CHECK-NEXT: vmovddup %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: ## xmm2 = xmm0[0,0] +; CHECK-NEXT: vmovddup %xmm0, %xmm0 +; CHECK-NEXT: ## xmm0 = xmm0[0,0] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) + %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1) + %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res2, %res3 + ret <2 x double> %res4 +} + +declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} +; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2] +; CHECK-NEXT: vmovddup %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2] +; CHECK-NEXT: vmovddup %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) + %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1) + %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2) + %res3 = fadd <4 x double> %res, %res1 + %res4 = fadd <4 x double> %res2, %res3 + ret <4 x double> %res4 +} + +define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) { +; CHECK-LABEL: test_rsqrt_ps_256_rr: +; CHECK: vrsqrt14ps %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) { +; CHECK-LABEL: test_rsqrt_ps_256_rrkz: +; CHECK: vrsqrt14ps 
%ymm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_rsqrt_ps_256_rrk: +; CHECK: vrsqrt14ps %ymm0, %ymm1 {%k1} + %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <4 x float> @test_rsqrt_ps_128_rr(<4 x float> %a0) { +; CHECK-LABEL: test_rsqrt_ps_128_rr: +; CHECK: vrsqrt14ps %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) { +; CHECK-LABEL: test_rsqrt_ps_128_rrkz: +; CHECK: vrsqrt14ps %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_rsqrt_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_rsqrt_ps_128_rrk: +; CHECK: vrsqrt14ps %xmm0, %xmm1 {%k1} + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone + +define <8 x float> @test_rcp_ps_256_rr(<8 x float> %a0) { +; CHECK-LABEL: test_rcp_ps_256_rr: +; CHECK: vrcp14ps %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) { +; CHECK-LABEL: test_rcp_ps_256_rrkz: +; CHECK: vrcp14ps %ymm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_rcp_ps_256_rrk: +; CHECK: vrcp14ps %ymm0, %ymm1 {%k1} + %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <4 x float> @test_rcp_ps_128_rr(<4 x float> %a0) { +; CHECK-LABEL: test_rcp_ps_128_rr: +; CHECK: vrcp14ps %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) { +; CHECK-LABEL: test_rcp_ps_128_rrkz: +; CHECK: vrcp14ps %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_rcp_ps_128_rrk: +; CHECK: vrcp14ps %xmm0, %xmm1 {%k1} + %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone + + +define <4 x double> @test_rsqrt_pd_256_rr(<4 x double> %a0) { +; CHECK-LABEL: test_rsqrt_pd_256_rr: +; CHECK: vrsqrt14pd 
%ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) { +; CHECK-LABEL: test_rsqrt_pd_256_rrkz: +; CHECK: vrsqrt14pd %ymm0, %ymm0 {%k1} {z} + %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res +} + +define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_rsqrt_pd_256_rrk: +; CHECK: vrsqrt14pd %ymm0, %ymm1 {%k1} + %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask) + ret <4 x double> %res +} + +define <2 x double> @test_rsqrt_pd_128_rr(<2 x double> %a0) { +; CHECK-LABEL: test_rsqrt_pd_128_rr: +; CHECK: vrsqrt14pd %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + +define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) { +; CHECK-LABEL: test_rsqrt_pd_128_rrkz: +; CHECK: vrsqrt14pd %xmm0, %xmm0 {%k1} {z} + %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask) + ret <2 x double> %res +} + +define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_rsqrt_pd_128_rrk: +; CHECK: vrsqrt14pd %xmm0, %xmm1 {%k1} + %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask) + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x double> @test_rcp_pd_256_rr(<4 x double> %a0) { +; CHECK-LABEL: test_rcp_pd_256_rr: +; CHECK: vrcp14pd %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) { +; CHECK-LABEL: test_rcp_pd_256_rrkz: +; CHECK: vrcp14pd %ymm0, %ymm0 {%k1} {z} + %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res +} + +define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_rcp_pd_256_rrk: +; CHECK: vrcp14pd %ymm0, %ymm1 {%k1} + %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask) + ret <4 x double> %res +} + +define <2 x double> @test_rcp_pd_128_rr(<2 x double> %a0) { +; CHECK-LABEL: test_rcp_pd_128_rr: +; CHECK: vrcp14pd %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + +define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) { +; CHECK-LABEL: test_rcp_pd_128_rrkz: +; CHECK: vrcp14pd %xmm0, %xmm0 {%k1} {z} + %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask) + ret <2 x double> %res +} + +define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_rcp_pd_128_rrk: +; CHECK: vrcp14pd %xmm0, %xmm1 {%k1} + %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> %a1, 
i8 %mask) + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) { +; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256: +; CHECK: kmovw %eax, %k1 +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 + + %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) + %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask) + %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + %res3 = fadd <4 x double> %res, %res1 + %res4 = fadd <4 x double> %res2, %res3 + ret <4 x double> %res4 +} +declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4 x double>, i8) nounwind readonly + +define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) { +; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256: +; CHECK: kmovw %eax, %k1 +; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 + + %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) + %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask) + %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} +declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x float>, i8) nounwind readonly + +define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) { +; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128: +; CHECK: kmovw %eax, %k1 +; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 + + %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) + %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} +declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly + + +declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256: +; CHECK: kmovw %eax, %k1 +; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} +; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} +; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm0 +; CHECK: vaddps %ymm1, %ymm0, %ymm0 +; CHECK: 
vaddps %ymm0, %ymm2, %ymm0 + + %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1) + %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) + %res3 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask) + %res4 = fadd <8 x float> %res1, %res2 + %res5 = fadd <8 x float> %res3, %res4 + ret <8 x float> %res5 +} + +declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256: +; CHECK: kmovw %eax, %k1 +; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} +; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} +; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm0 +; CHECK: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK: vpaddd %ymm0, %ymm2, %ymm0 + + %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) + %res3 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) + %res4 = add <8 x i32> %res1, %res2 + %res5 = add <8 x i32> %res3, %res4 + ret <8 x i32> %res5 +} + +declare <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) + %res3 = add <2 x i64> %res, %res1 + %res4 = add <2 x i64> %res3, %res2 + ret <2 x i64> %res4 +} + +declare <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64>, <2 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1) + %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) + %res3 = add <4 x i64> %res, %res1 + %res4 = add <4 x i64> %res3, %res2 + ret <4 x i64> %res4 +} + +declare <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64>, i8, <2 x i64>, i8) + 
+define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> %x2, i8 %x3) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> %x2, i8 -1) + %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> zeroinitializer, i8 %x3) + %res3 = add <2 x i64> %res, %res1 + %res4 = add <2 x i64> %res2, %res3 + ret <2 x i64> %res4 +} + +declare <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64>, i8, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> %x2, i8 %x3) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> %x2, i8 -1) + %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> zeroinitializer, i8 %x3) + %res3 = add <4 x i64> %res, %res1 + %res4 = add <4 x i64> %res2, %res3 + ret <4 x i64> %res4 +} +declare <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) + %res3 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32>, <4 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> 
@llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) + %res3 = add <8 x i32> %res, %res1 + %res4 = add <8 x i32> %res2, %res3 + ret <8 x i32> %res4 +} + +declare <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32>, i8, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrld $255, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpsrld $255, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpsrld $255, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> %x2, i8 -1) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> zeroinitializer, i8 %x3) + %res3 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res2, %res3 + ret <4 x i32> %res4 +} + +declare <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32>, i8, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrld $255, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vpsrld $255, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpsrld $255, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> %x2, i8 -1) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> zeroinitializer, i8 %x3) + %res3 = add <8 x i32> %res, %res1 + %res4 = add <8 x i32> %res2, %res3 + ret <8 x i32> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i8, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpsrld $255, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vpsrld $255, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vpsrld $255, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 -1) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> zeroinitializer, i16 %x3) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res2, %res3 + ret <16 x i32> %res4 +} + +declare <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64>, <2 x i64>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_psrlv2_di(<2 x i64> %x0, <2 x 
i64> %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrlv2_di: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3) + %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + %res3 = add <2 x i64> %res, %res1 + %res4 = add <2 x i64> %res3, %res2 + ret <2 x i64> %res4 +} + +declare <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64>, <4 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_psrlv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_di: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) + %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + %res3 = add <4 x i64> %res, %res1 + %res4 = add <4 x i64> %res3, %res2 + ret <4 x i64> %res4 +} + +declare <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_psrlv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_si: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + %res3 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_psrlv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_si: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> 
%x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + %res3 = add <8 x i32> %res, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_psra_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 {%k1} {z} +; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + %res3 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32>, <4 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 {%k1} {z} +; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1) + %res3 = add <8 x i32> %res, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32>, i8, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_psra_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpsrad $3, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpsrad $3, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpsrad $3, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1) + %res3 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32>, i8, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_psra_di_256(<8 x i32> 
+declare <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32>, i8, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psra_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+  %res3 = add <4 x i32> %res, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32>, i8, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psra_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
+  %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
+  %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+  %res3 = add <8 x i32> %res, %res1
+  %res4 = add <8 x i32> %res3, %res2
+  ret <8 x i32> %res4
+}
+
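+; vpsraq (packed 64-bit arithmetic right shift) has no SSE/AVX2 predecessor
+; and is new with AVX-512; in the variable-count tests the count is taken
+; from the low quadword of %x1.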
+declare <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psra_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+  %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+  %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+  %res3 = add <2 x i64> %res, %res1
+  %res4 = add <2 x i64> %res3, %res2
+  ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64>, <2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psra_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
+  %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+  %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
+  %res3 = add <4 x i64> %res, %res1
+  %res4 = add <4 x i64> %res3, %res2
+  ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psra_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
+  %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
+  %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+  %res3 = add <2 x i64> %res, %res1
+  %res4 = add <2 x i64> %res3, %res2
+  ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psra_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
+  %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
+  %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+  %res3 = add <4 x i64> %res, %res1
+  %res4 = add <4 x i64> %res3, %res2
+  ret <4 x i64> %res4
+}
+
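+; The logical left-shift (vpslld/vpsllq) tests below follow the same
+; masked / zero-masked / unmasked pattern as the arithmetic shifts above.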
+declare <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psll_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+  %res3 = add <4 x i32> %res, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32>, <4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
+  %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+  %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
+  %res3 = add <8 x i32> %res, %res1
+  %res4 = add <8 x i32> %res3, %res2
+  ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32>, i8, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+  %res3 = add <4 x i32> %res, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32>, i8, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
+  %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
+  %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+  %res3 = add <8 x i32> %res, %res1
+  %res4 = add <8 x i32> %res3, %res2
+  ret <8 x i32> %res4
+}
+
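+; Variable-count vpsllq on 256-bit vectors still takes its shift count from
+; the low quadword of an XMM register (%x1).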
+declare <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64>, <2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
+  %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+  %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
+  %res3 = add <4 x i64> %res, %res1
+  %res4 = add <4 x i64> %res3, %res2
+  ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psll_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
+  %res1 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
+  %res2 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+  %res3 = add <2 x i64> %res, %res1
+  %res4 = add <2 x i64> %res3, %res2
+  ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+  %res = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
+  %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
+  %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+  %res3 = add <4 x i64> %res, %res1
+  %res4 = add <4 x i64> %res3, %res2
+  ret <4 x i64> %res4
+}