From 3aac2c627be19617472274bba4066f297ac81b00 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 15 Aug 2015 13:27:30 +0000
Subject: [PATCH] [DAGCombiner] Attempt to mask vectors before zero extension instead of after.

For cases where we TRUNCATE and then ZERO_EXTEND to a larger size (often from
vector legalization), see if we can mask the source data and then ZERO_EXTEND
(instead of masking after an ANY_EXTEND). This can help avoid having to
generate a larger mask, and possibly having to apply it to several
sub-vectors.

(zext (truncate x)) -> (zext (and x, m))

Includes a minor patch to SystemZ to better recognise 8/16-bit zero extension
patterns from RISBG bit-extraction code.

This is the first of a number of minor patches to help improve the conversion
of byte masks to clear mask shuffles.

Differential Revision: http://reviews.llvm.org/D11764

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245160 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp   | 42 +++++++++++------
 lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 22 +++++++--
 test/CodeGen/AArch64/arm64-aapcs.ll        | 15 +++---
 test/CodeGen/AArch64/arm64-arith.ll        |  3 +-
 test/CodeGen/AArch64/arm64-vector-ext.ll   | 54 +++++++++++-----------
 test/CodeGen/AArch64/bitfield.ll           | 44 ++++++++++++------
 test/CodeGen/SystemZ/insert-05.ll          |  4 +-
 test/CodeGen/X86/avx2-conversions.ll       |  2 +-
 test/CodeGen/X86/vec_cast2.ll              |  7 ++-
 test/CodeGen/X86/vector-zext.ll            | 46 +++++++++---------
 10 files changed, 141 insertions(+), 98 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c23751cee1b..f2e4e6ba240 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6112,31 +6112,45 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   }
 
   // fold (zext (truncate x)) -> (and x, mask)
-  if (N0.getOpcode() == ISD::TRUNCATE &&
-      (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) {
-
+  if (N0.getOpcode() == ISD::TRUNCATE) {
     // fold (zext (truncate (load x))) -> (zext (smaller load x))
     // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
     if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
-      SDNode* oye = N0.getNode()->getOperand(0).getNode();
+      SDNode *oye = N0.getNode()->getOperand(0).getNode();
       if (NarrowLoad.getNode() != N0.getNode()) {
         CombineTo(N0.getNode(), NarrowLoad);
         // CombineTo deleted the truncate, if needed, but not what's under it.
         AddToWorklist(oye);
       }
-      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+      return SDValue(N, 0); // Return N so it doesn't get rechecked!
     }
 
-    SDValue Op = N0.getOperand(0);
-    if (Op.getValueType().bitsLT(VT)) {
-      Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
-      AddToWorklist(Op.getNode());
-    } else if (Op.getValueType().bitsGT(VT)) {
-      Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
-      AddToWorklist(Op.getNode());
+    EVT SrcVT = N0.getOperand(0).getValueType();
+    EVT MinVT = N0.getValueType();
+
+    // Try to mask before the extension to avoid having to generate a larger mask,
+    // possibly over several sub-vectors.
+    if (SrcVT.bitsLT(VT)) {
+      if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
+                               TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
+        SDValue Op = N0.getOperand(0);
+        Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
+        AddToWorklist(Op.getNode());
+        return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
+      }
+    }
+
+    if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
+      SDValue Op = N0.getOperand(0);
+      if (SrcVT.bitsLT(VT)) {
+        Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
+        AddToWorklist(Op.getNode());
+      } else if (SrcVT.bitsGT(VT)) {
+        Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
+        AddToWorklist(Op.getNode());
+      }
+      return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
     }
-    return DAG.getZeroExtendInReg(Op, SDLoc(N),
-                                  N0.getValueType().getScalarType());
   }
 
   // Fold (zext (and (trunc x), cst)) -> (and x, cst),
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 75fd37f01a1..81a1f968023 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -801,7 +801,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
     RxSBG.Input = N.getOperand(0);
     return true;
   }
-  
+
   case ISD::ANY_EXTEND:
     // Bits above the extended operand are don't-care.
     RxSBG.Input = N.getOperand(0);
@@ -818,7 +818,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
     return true;
   }
   // Fall through.
-  
+
   case ISD::SIGN_EXTEND: {
     // Check that the extension bits are don't-care (i.e. are masked out
     // by the final mask).
@@ -938,7 +938,23 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
     }
     return nullptr;
   }
-  } 
+  }
+
+  // If the RISBG operands require no rotation and just mask the bottom
+  // 8/16 bits, attempt to convert this to an LLC zero extension.
+  if (RISBG.Rotate == 0 && (RISBG.Mask == 0xff || RISBG.Mask == 0xffff)) {
+    unsigned OpCode = (RISBG.Mask == 0xff ? SystemZ::LLGCR : SystemZ::LLGHR);
+    if (VT == MVT::i32) {
+      if (Subtarget->hasHighWord())
+        OpCode = (RISBG.Mask == 0xff ? SystemZ::LLCRMux : SystemZ::LLHRMux);
+      else
+        OpCode = (RISBG.Mask == 0xff ? SystemZ::LLCR : SystemZ::LLHR);
+    }
+
+    SDValue In = convertTo(DL, VT, RISBG.Input);
+    N = CurDAG->getMachineNode(OpCode, DL, VT, In);
+    return convertTo(DL, VT, SDValue(N, 0)).getNode();
+  }
 
   unsigned Opcode = SystemZ::RISBG;
   // Prefer RISBGN if available, since it does not clobber CC.
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index f345acf453d..5d25f66b30e 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -27,12 +27,13 @@ define [2 x i64] @test_i64x2_align(i32, [2 x i64] %arg, i32 %after) {
 ; Check stack slots are 64-bit at all times.
 define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short,
                               i32 %int, i64 %long) {
-  ; Part of last store. Blasted scheduler.
-; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32]
-
   %ext_bool = zext i1 %bool to i64
   store volatile i64 %ext_bool, i64* @var64, align 8
 ; CHECK: ldrb w[[EXT:[0-9]+]], [sp]
+
+  ; Part of last store. Blasted scheduler.
+; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32] + ; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1 ; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64] @@ -63,8 +64,8 @@ define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) { %ext_bool = zext i1 %bool to i64 store volatile i64 %ext_bool, i64* @var64 -; CHECK: and [[EXT:x[0-9]+]], x0, #0x1 -; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] +; CHECK: and w[[EXT:[0-9]+]], w0, #0x1 +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] %ext_char = sext i8 %char to i64 store volatile i64 %ext_char, i64* @var64 @@ -73,8 +74,8 @@ define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) { %ext_short = zext i16 %short to i64 store volatile i64 %ext_short, i64* @var64 -; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff -; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] +; CHECK: and w[[EXT:[0-9]+]], w2, #0xffff +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] %ext_int = zext i32 %int to i64 store volatile i64 %ext_int, i64* @var64 diff --git a/test/CodeGen/AArch64/arm64-arith.ll b/test/CodeGen/AArch64/arm64-arith.ll index f36e706b15d..d5d9a1b9817 100644 --- a/test/CodeGen/AArch64/arm64-arith.ll +++ b/test/CodeGen/AArch64/arm64-arith.ll @@ -123,7 +123,8 @@ entry: define i64 @t14(i16 %a, i64 %x) nounwind ssp { entry: ; CHECK-LABEL: t14: -; CHECK: add x0, x1, w0, uxth #3 +; CHECK: and w8, w0, #0xffff +; CHECK: add x0, x1, w8, uxtw #3 ; CHECK: ret %c = zext i16 %a to i64 %d = shl i64 %c, 3 diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll index 5bee1611e6c..994a9956cf7 100644 --- a/test/CodeGen/AArch64/arm64-vector-ext.ll +++ b/test/CodeGen/AArch64/arm64-vector-ext.ll @@ -1,27 +1,27 @@ -; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s - -;CHECK: @func30 -;CHECK: ushll.4s v0, v0, #0 -;CHECK: movi.4s v1, #0x1 -;CHECK: and.16b v0, v0, v1 -;CHECK: str q0, [x0] -;CHECK: ret - -%T0_30 = type <4 x i1> -%T1_30 = type <4 x i32> -define void @func30(%T0_30 %v0, %T1_30* %p1) { - %r = zext %T0_30 %v0 to %T1_30 - store %T1_30 %r, %T1_30* %p1 - ret void -} - -; Extend from v1i1 was crashing things (PR20791). Make sure we do something -; sensible instead. -define <1 x i32> @autogen_SD7918() { -; CHECK-LABEL: autogen_SD7918 -; CHECK: movi d0, #0000000000000000 -; CHECK-NEXT: ret - %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0 - %ZE = zext <1 x i1> %I29 to <1 x i32> - ret <1 x i32> %ZE -} +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +;CHECK: @func30 +;CHECK: movi.4h v1, #0x1 +;CHECK: and.8b v0, v0, v1 +;CHECK: ushll.4s v0, v0, #0 +;CHECK: str q0, [x0] +;CHECK: ret + +%T0_30 = type <4 x i1> +%T1_30 = type <4 x i32> +define void @func30(%T0_30 %v0, %T1_30* %p1) { + %r = zext %T0_30 %v0 to %T1_30 + store %T1_30 %r, %T1_30* %p1 + ret void +} + +; Extend from v1i1 was crashing things (PR20791). Make sure we do something +; sensible instead. 
+define <1 x i32> @autogen_SD7918() { +; CHECK-LABEL: autogen_SD7918 +; CHECK: movi d0, #0000000000000000 +; CHECK-NEXT: ret + %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0 + %ZE = zext <1 x i1> %I29 to <1 x i32> + ret <1 x i32> %ZE +} diff --git a/test/CodeGen/AArch64/bitfield.ll b/test/CodeGen/AArch64/bitfield.ll index e1e4f62f662..5f19b6943b8 100644 --- a/test/CodeGen/AArch64/bitfield.ll +++ b/test/CodeGen/AArch64/bitfield.ll @@ -3,51 +3,67 @@ @var32 = global i32 0 @var64 = global i64 0 -define void @test_extendb(i8 %var) { -; CHECK-LABEL: test_extendb: +define void @test_extendb32(i8 %var) { +; CHECK-LABEL: test_extendb32: %sxt32 = sext i8 %var to i32 store volatile i32 %sxt32, i32* @var32 ; CHECK: sxtb {{w[0-9]+}}, {{w[0-9]+}} - %sxt64 = sext i8 %var to i64 - store volatile i64 %sxt64, i64* @var64 -; CHECK: sxtb {{x[0-9]+}}, {{w[0-9]+}} - ; N.b. this doesn't actually produce a bitfield instruction at the ; moment, but it's still a good test to have and the semantics are ; correct. %uxt32 = zext i8 %var to i32 store volatile i32 %uxt32, i32* @var32 ; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xff + ret void +} + +define void @test_extendb64(i8 %var) { +; CHECK-LABEL: test_extendb64: + + %sxt64 = sext i8 %var to i64 + store volatile i64 %sxt64, i64* @var64 +; CHECK: sxtb {{x[0-9]+}}, {{w[0-9]+}} +; N.b. this doesn't actually produce a bitfield instruction at the +; moment, but it's still a good test to have and the semantics are +; correct. %uxt64 = zext i8 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xff ret void } -define void @test_extendh(i16 %var) { -; CHECK-LABEL: test_extendh: +define void @test_extendh32(i16 %var) { +; CHECK-LABEL: test_extendh32: %sxt32 = sext i16 %var to i32 store volatile i32 %sxt32, i32* @var32 ; CHECK: sxth {{w[0-9]+}}, {{w[0-9]+}} - %sxt64 = sext i16 %var to i64 - store volatile i64 %sxt64, i64* @var64 -; CHECK: sxth {{x[0-9]+}}, {{w[0-9]+}} - ; N.b. this doesn't actually produce a bitfield instruction at the ; moment, but it's still a good test to have and the semantics are ; correct. %uxt32 = zext i16 %var to i32 store volatile i32 %uxt32, i32* @var32 ; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff + ret void +} + +define void @test_extendh64(i16 %var) { +; CHECK-LABEL: test_extendh64: + + %sxt64 = sext i16 %var to i64 + store volatile i64 %sxt64, i64* @var64 +; CHECK: sxth {{x[0-9]+}}, {{w[0-9]+}} +; N.b. this doesn't actually produce a bitfield instruction at the +; moment, but it's still a good test to have and the semantics are +; correct. %uxt64 = zext i16 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff ret void } diff --git a/test/CodeGen/SystemZ/insert-05.ll b/test/CodeGen/SystemZ/insert-05.ll index b76859a568f..1ea8a64e28e 100644 --- a/test/CodeGen/SystemZ/insert-05.ll +++ b/test/CodeGen/SystemZ/insert-05.ll @@ -214,8 +214,8 @@ define i64 @f18(i32 %a) { ; The truncation here isn't free; we need an explicit zero extension. 
 define i64 @f19(i32 %a) {
 ; CHECK-LABEL: f19:
-; CHECK: llgcr %r2, %r2
-; CHECK: oihl %r2, 1
+; CHECK: llcr %r2, %r2
+; CHECK: iihf %r2, 1
 ; CHECK: br %r14
   %trunc = trunc i32 %a to i8
   %ext = zext i8 %trunc to i64
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index f97a8ff69ee..402e2e04e62 100644
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -61,8 +61,8 @@ define <8 x i32> @zext8(<8 x i16> %A) nounwind {
 define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
 ; CHECK-LABEL: zext_8i8_8i32:
 ; CHECK: ## BB#0:
+; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT: retq
   %B = zext <8 x i8> %A to <8 x i32>
   ret <8 x i32>%B
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 7c039bc9850..311c7870b4b 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -46,11 +46,10 @@ define <4 x float> @foo1_4(<4 x i8> %src) {
 define <8 x float> @foo2_8(<8 x i8> %src) {
 ; CHECK-LABEL: foo2_8:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpand LCPI2_0, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index 04ace30b819..752eef68fbc 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -910,50 +910,46 @@ entry:
 define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
 ; SSE2-LABEL: zext_8i8_to_8i32:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: zext_8i8_to_8i32:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: zext_8i8_to_8i32:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: zext_8i8_to_8i32:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: zext_8i8_to_8i32:
 ; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: retq
 entry:
   %t = zext <8 x i8> %z to <8 x i32>
-- 
2.34.1
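
Illustrative addendum (not part of the patch itself): the DAGCombiner change is
easiest to see on a widening vector zero extension such as the
zext_8i8_to_8i32 test updated above. The reproducer below is a minimal sketch;
the function name and the llc invocation it assumes
(-mtriple=x86_64-unknown-unknown -mattr=+avx2) are hypothetical, not taken from
the patch. After type legalization the <8 x i8> source is carried in an
<8 x i16> register, so the extension reaches the combiner as
(zext (truncate x)); with this change the byte mask is applied to that 128-bit
source with a single vpand before the zero extension, instead of a 256-bit
vpand after extending.

; Minimal sketch, assuming it is run with:
;   llc -mtriple=x86_64-unknown-unknown -mattr=+avx2
; Expected codegen shape after this patch: a 128-bit vpand (byte mask) on %xmm0
; followed by vpmovzxwd to %ymm0, rather than vpmovzxwd followed by a 256-bit vpand.
define <8 x i32> @zext_mask_demo(<8 x i8> %x) {
  %e = zext <8 x i8> %x to <8 x i32>
  ret <8 x i32> %e
}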