From 65b74e1d00eef81b596b4c207fba069aa1eb8214 Mon Sep 17 00:00:00 2001
From: Bruno Cardoso Lopes
Date: Thu, 21 Jul 2011 01:55:47 +0000
Subject: [PATCH] Add support for 256-bit versions of VPERMIL instruction.

This is a new instruction introduced in AVX, which can operate on
128- and 256-bit vectors. It treats a 256-bit vector as two independent
128-bit lanes. It can permute the 32- or 64-bit elements arbitrarily
within a lane, and restricts the second lane to the same permutation as
the first one. With the improved splat support introduced earlier today,
adding codegen for this instruction enables more efficient 256-bit code:

Instead of:
  vextractf128  $0, %ymm0, %xmm0
  punpcklbw     %xmm0, %xmm0
  punpckhbw     %xmm0, %xmm0
  vinsertf128   $0, %xmm0, %ymm0, %ymm1
  vinsertf128   $1, %xmm0, %ymm1, %ymm0
  vextractf128  $1, %ymm0, %xmm1
  shufps        $1, %xmm1, %xmm1
  movss         %xmm1, 28(%rsp)
  movss         %xmm1, 24(%rsp)
  movss         %xmm1, 20(%rsp)
  movss         %xmm1, 16(%rsp)
  vextractf128  $0, %ymm0, %xmm0
  shufps        $1, %xmm0, %xmm0
  movss         %xmm0, 12(%rsp)
  movss         %xmm0, 8(%rsp)
  movss         %xmm0, 4(%rsp)
  movss         %xmm0, (%rsp)
  vmovaps       (%rsp), %ymm0

We get:
  vextractf128  $0, %ymm0, %xmm0
  punpcklbw     %xmm0, %xmm0
  punpckhbw     %xmm0, %xmm0
  vinsertf128   $0, %xmm0, %ymm0, %ymm1
  vinsertf128   $1, %xmm0, %ymm1, %ymm0
  vpermilps     $85, %ymm0, %ymm0

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@135662 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/InstPrinter/X86InstComments.cpp   | 10 ++++
 lib/Target/X86/Utils/X86ShuffleDecode.cpp | 29 +++++++++
 lib/Target/X86/Utils/X86ShuffleDecode.h   | 14 +++++
 lib/Target/X86/X86ISelLowering.cpp        | 66 ++++++++++++++++++++
 lib/Target/X86/X86ISelLowering.h          |  1 +
 lib/Target/X86/X86InstrFragmentsSIMD.td   |  2 +
 lib/Target/X86/X86InstrSSE.td             |  4 ++
 test/CodeGen/X86/avx-256-splat.ll         | 16 +++++
 8 files changed, 142 insertions(+)
 create mode 100644 test/CodeGen/X86/avx-256-splat.ll

diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 4e28dfe7fa8..b2f246e4419 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -205,6 +205,16 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     DecodeUNPCKHPMask(4, ShuffleMask);
     Src1Name = getRegName(MI->getOperand(0).getReg());
     break;
+  case X86::VPERMILPSYri:
+    DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(),
+                        ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDYri:
+    DecodeVPERMILPDMask(4, MI->getOperand(2).getImm(),
+                        ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(0).getReg());
+    break;
   }

diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index cd06060748b..c1ff0e5011e 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -187,4 +187,33 @@ void DecodeUNPCKLPMask(EVT VT,
   }
 }

+void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask) {
+  DecodeVPERMILMask(MVT::getVectorVT(MVT::i32, NElts), Imm, ShuffleMask);
+}
+
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask) {
+  DecodeVPERMILMask(MVT::getVectorVT(MVT::i64, NElts), Imm, ShuffleMask);
+}
+
+// DecodeVPERMILMask - Decodes a VPERMIL permute for any 128-bit vector
+// with 32/64-bit elements. A 256-bit vector is treated as two 128-bit
+// lanes, and the mask of the second lane must be identical to that of
+// the first one.
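+// For example, VT = v8f32 with Imm = 0x55 (binary 01010101) selects
+// element 1 in every position of each lane: mask <1,1,1,1, 5,5,5,5>.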
+void DecodeVPERMILMask(EVT VT, unsigned Imm,
+                       SmallVectorImpl<unsigned> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = 0; i != NumElts/NumLanes; ++i) {
+      unsigned Idx = (Imm >> (i*2)) & 0x3;
+      ShuffleMask.push_back(Idx+(l*NumElts/NumLanes));
+    }
+  }
+}
+
 } // llvm namespace

diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index b18f6703309..4a5214028fa 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -82,6 +82,20 @@ void DecodeUNPCKLPDMask(unsigned NElts,
 void DecodeUNPCKLPMask(EVT VT,
                        SmallVectorImpl<unsigned> &ShuffleMask);
+
+void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask);
+
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask);
+
+// DecodeVPERMILMask - Decodes a VPERMIL permute for any 128-bit vector
+// with 32/64-bit elements. A 256-bit vector is treated as two 128-bit
+// lanes, and the mask of the second lane must be identical to that of
+// the first one.
+void DecodeVPERMILMask(EVT VT, unsigned Imm,
+                       SmallVectorImpl<unsigned> &ShuffleMask);
+
 } // llvm namespace

 #endif

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c4e44f999d0..1c827f0dd30 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2747,6 +2747,7 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::PUNPCKHBW:
   case X86ISD::PUNPCKHDQ:
   case X86ISD::PUNPCKHQDQ:
+  case X86ISD::VPERMIL:
     return true;
   }
   return false;
@@ -2772,6 +2773,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
+  case X86ISD::VPERMIL:
    return DAG.getNode(Opc, dl, VT, V1,
                       DAG.getConstant(TargetMask, MVT::i8));
   }
@@ -3422,6 +3424,56 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
   return ::isMOVLMask(M, N->getValueType(0));
 }

+/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
+static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+
+  // Match any permutation of a 128-bit vector with 32/64-bit types.
+  if (NumLanes == 1) {
+    if (NumElts == 4 || NumElts == 2)
+      return true;
+    return false;
+  }
+
+  // Only match 256-bit vectors with 32/64-bit types.
+  if (NumElts != 8 && NumElts != 4)
+    return false;
+
+  // The mask on the high lane should be the same as the low one. They may
+  // differ only where one of the corresponding indices in a lane is undef.
+  int LaneSize = NumElts/NumLanes;
+  for (int i = 0; i < LaneSize; ++i) {
+    int HighElt = i+LaneSize;
+    if (Mask[i] < 0 || Mask[HighElt] < 0)
+      continue;
+
+    if (Mask[HighElt]-Mask[i] != LaneSize)
+      return false;
+  }
+
+  return true;
+}
+
+/// getShuffleVPERMILImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMIL* instructions.
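+/// For example, the lane-replicated splat mask <1,1,1,1,5,5,5,5> encodes
+/// its low lane as Imm = 1 | (1<<2) | (1<<4) | (1<<6) = 0x55 (85).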
+static unsigned getShuffleVPERMILImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits()/128;
+
+  unsigned Mask = 0;
+  for (int i = 0; i < NumElts/NumLanes /* lane size */; ++i)
+    Mask |= SVOp->getMaskElt(i) << (i*2);
+
+  return Mask;
+}
+
 /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
 /// of what x86 movss want. X86 movs requires the lowest element to be lowest
 /// element of vector 2 and the other elements to come from vector 1 in order.
@@ -4097,6 +4149,11 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
     return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                Depth+1);
   }
+  case X86ISD::VPERMIL:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                      ShuffleMask);
+    break;
   default:
     assert("not implemented for target shuffle node");
     return SDValue();
@@ -6043,6 +6100,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (NumElems == 4)
     return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);

+  // Handle VPERMIL permutations
+  if (isVPERMILMask(M, VT)) {
+    unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
+    if (VT == MVT::v8f32)
+      return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
+  }
+
   return SDValue();
 }

@@ -9660,6 +9724,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PUNPCKHWD:          return "X86ISD::PUNPCKHWD";
   case X86ISD::PUNPCKHDQ:          return "X86ISD::PUNPCKHDQ";
   case X86ISD::PUNPCKHQDQ:         return "X86ISD::PUNPCKHQDQ";
+  case X86ISD::VPERMIL:            return "X86ISD::VPERMIL";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -12465,6 +12530,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::PSHUFLW:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
+  case X86ISD::VPERMIL:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
   }

diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 376aa8a4409..298d45110ff 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -271,6 +271,7 @@ namespace llvm {
       PUNPCKHWD,
       PUNPCKHDQ,
       PUNPCKHQDQ,
+      VPERMIL,

       // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
       // according to %al. An operator is needed so that this can be expanded

diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index b00109c9fa4..f792f53b7ba 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -150,6 +150,8 @@ def X86Punpckhwd  : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>;
 def X86Punpckhdq  : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>;
 def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>;

+def X86VPermil : SDNode<"X86ISD::VPERMIL", SDTShuff2OpI>;
+
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
 //===----------------------------------------------------------------------===//

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ae26a80fe2d..73e465fe29e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5529,6 +5529,10 @@ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
 // The AVX version of some but not all of them are described here, and more
 // should come in a near future.

+// Shuffle with VPERMIL instructions
+def : Pat<(v8f32 (X86VPermil VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPSYri VR256:$src1, imm:$imm)>;
+
 // Shuffle with PSHUFD instruction folding loads. The first two patterns match
 // SSE2 loads, which are always promoted to v2i64. The last one should match
 // the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD

diff --git a/test/CodeGen/X86/avx-256-splat.ll b/test/CodeGen/X86/avx-256-splat.ll
new file mode 100644
index 00000000000..27ff9268b40
--- /dev/null
+++ b/test/CodeGen/X86/avx-256-splat.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; FIXME: use avx versions for punpcklbw and punpckhbw
+
+; CHECK: vextractf128 $0
+; CHECK-NEXT: punpcklbw
+; CHECK-NEXT: punpckhbw
+; CHECK-NEXT: vinsertf128 $0
+; CHECK-NEXT: vinsertf128 $1
+; CHECK-NEXT: vpermilps $85
+define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <32 x i8> %shuffle
+}
+
-- 
2.34.1
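
For reference, the immediate encoding used by this patch can be modeled with
a small standalone program. The following is an illustrative sketch, not code
from the tree: the file and function names are made up, and only the 32-bit
(PS) form is modeled. It round-trips the vpermilps $85 case from the new test.

// vpermil_demo.cpp - standalone model of the VPERMILPS immediate encoding.
// Build: c++ -std=c++11 vpermil_demo.cpp && ./a.out
#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors DecodeVPERMILMask for 32-bit elements: each 2-bit field of the
// immediate picks an element within a 128-bit lane, and every lane reuses
// the same pattern.
static std::vector<unsigned> decodeVPermilPS(unsigned NumElts, unsigned Imm) {
  unsigned NumLanes = (NumElts * 32) / 128;
  unsigned LaneSize = NumElts / NumLanes;
  std::vector<unsigned> Mask;
  for (unsigned l = 0; l != NumLanes; ++l)
    for (unsigned i = 0; i != LaneSize; ++i)
      Mask.push_back(((Imm >> (i * 2)) & 0x3) + l * LaneSize);
  return Mask;
}

// Mirrors getShuffleVPERMILImmediate: pack the low-lane indices of the
// shuffle mask into the immediate, two bits per element.
static unsigned encodeVPermilPS(const std::vector<unsigned> &Mask,
                                unsigned LaneSize) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != LaneSize; ++i)
    Imm |= (Mask[i] & 0x3) << (i * 2);
  return Imm;
}

int main() {
  // The new test's splat: vpermilps $85 on v8f32 <-> mask <1,1,1,1,5,5,5,5>.
  std::vector<unsigned> Expected = {1, 1, 1, 1, 5, 5, 5, 5};
  assert(decodeVPermilPS(8, 85) == Expected);
  assert(encodeVPermilPS(Expected, 4) == 85);
  std::printf("imm 85 <-> <1,1,1,1,5,5,5,5> round-trips\n");
  return 0;
}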