; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vunpckhpd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
; CHECK-NEXT: vunpckhpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
%res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
%res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
%res2 = fadd <8 x double> %res, %res1
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vunpckhps %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm2[2],k1[2],zmm2[3],k1[3],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[14],k1[14],zmm2[15],k1[15]
; CHECK-NEXT: vunpckhps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
%res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
%res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
%res2 = fadd <16 x float> %res, %res1
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vunpcklpd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
; CHECK-NEXT: vunpcklpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
%res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
%res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
%res2 = fadd <8 x double> %res, %res1
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vunpcklps %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[12],k1[12],zmm2[13],k1[13]
; CHECK-NEXT: vunpcklps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
%res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
%res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
%res2 = fadd <16 x float> %res, %res1
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vpunpcklqdq {{.*#+}}
-; CHECK: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: ## zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
+; CHECK-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm3 {%k1} {z
+; CHECK-NEXT: ## zmm3 = k1[0],zmm0[0],k1[2],zmm0[2],k1[4],zmm0[4],k1[6],zmm0[6]
+; CHECK-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpunpckhqdq {{.*#+}}
-; CHECK: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: ## zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
+; CHECK-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpunpckhdq {{.*#+}}
-; CHECK: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: ## zmm2 = zmm2[2],k1[2],zmm2[3],k1[3],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[14],k1[14],zmm2[15],k1[15]
+; CHECK-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpckldq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpunpckldq {{.*#+}}
-; CHECK: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: ## zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[12],k1[12],zmm2[13],k1[13]
+; CHECK-NEXT: vpunpckldq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)