From: Bruno Cardoso Lopes Date: Fri, 12 Aug 2011 21:48:26 +0000 (+0000) Subject: The VPERM2F128 is a AVX instruction which permutes between two 256-bit X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=53cae1362dca8aa312c3e36c10b106ea7d349f93;p=oota-llvm.git The VPERM2F128 is a AVX instruction which permutes between two 256-bit vectors. It operates on 128-bit elements instead of regular scalar types. Recognize shuffles that are suitable for VPERM2F128 and teach the x86 legalizer how to handle them. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@137519 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index cb4996bba11..12f71746719 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -225,6 +225,11 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERM2F128rr: + DecodeVPERM2F128Mask(MI->getOperand(3).getImm(), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + break; } diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index fe0b3a20c1f..aeb3309d09a 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -220,4 +220,24 @@ void DecodeVPERMILPDMask(unsigned NumElts, unsigned Imm, } } +void DecodeVPERM2F128Mask(EVT VT, unsigned Imm, + SmallVectorImpl &ShuffleMask) { + unsigned HalfSize = VT.getVectorNumElements()/2; + unsigned FstHalfBegin = (Imm & 0x3) * HalfSize; + unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize; + + for (int i = FstHalfBegin, e = FstHalfBegin+HalfSize; i != e; ++i) + ShuffleMask.push_back(i); + for (int i = SndHalfBegin, e = SndHalfBegin+HalfSize; i != e; ++i) + ShuffleMask.push_back(i); +} + +void DecodeVPERM2F128Mask(unsigned Imm, + SmallVectorImpl &ShuffleMask) { + // VPERM2F128 is used by any 256-bit EVT, but X86InstComments only + // has information about the instruction and not the types. So for + // instruction comments purpose, assume the 256-bit vector is v4i64. + return DecodeVPERM2F128Mask(MVT::v4i64, Imm, ShuffleMask); +} + } // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 1b11c6e3c26..58193e6a468 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -97,6 +97,11 @@ void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm, void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm, SmallVectorImpl &ShuffleMask); +void DecodeVPERM2F128Mask(unsigned Imm, + SmallVectorImpl &ShuffleMask); +void DecodeVPERM2F128Mask(EVT VT, unsigned Imm, + SmallVectorImpl &ShuffleMask); + } // llvm namespace #endif diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 10ab70750e3..4c83a728a55 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -2753,6 +2753,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPERMILPSY: case X86ISD::VPERMILPD: case X86ISD::VPERMILPDY: + case X86ISD::VPERM2F128: return true; } return false; @@ -2795,6 +2796,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PALIGN: case X86ISD::SHUFPD: case X86ISD::SHUFPS: + case X86ISD::VPERM2F128: return DAG.getNode(Opc, dl, VT, V1, V2, DAG.getConstant(TargetMask, MVT::i8)); } @@ -3033,6 +3035,17 @@ static bool isUndefOrEqual(int Val, int CmpVal) { return false; } +/// isUndefOrInRange - Return true if every element in Mask, begining from +/// position Pos and ending in Pos+Size, falls within the specified sequential +/// range (L, L+Pos]. or is undef. +static bool isSequentialOrUndefInRange(const SmallVectorImpl &Mask, + int Pos, int Size, int Low) { + for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) + if (!isUndefOrEqual(Mask[i], Low)) + return false; + return true; +} + /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference /// the second operand. @@ -3444,6 +3457,67 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) { return ::isMOVLMask(M, N->getValueType(0)); } +/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered +/// as permutations between 128-bit chunks or halves. As an example: this +/// shuffle bellow: +/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> +/// The first half comes from the second half of V1 and the second half from the +/// the second half of V2. +static bool isVPERM2F128Mask(const SmallVectorImpl &Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) + return false; + + // The shuffle result is divided into half A and half B. In total the two + // sources have 4 halves, namely: C, D, E, F. The final values of A and + // B must come from C, D, E or F. + int HalfSize = VT.getVectorNumElements()/2; + bool MatchA = false, MatchB = false; + + // Check if A comes from one of C, D, E, F. + for (int Half = 0; Half < 4; ++Half) { + if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { + MatchA = true; + break; + } + } + + // Check if B comes from one of C, D, E, F. + for (int Half = 0; Half < 4; ++Half) { + if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { + MatchB = true; + break; + } + } + + return MatchA && MatchB; +} + +/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERM2F128 instructions. +static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast(N); + EVT VT = SVOp->getValueType(0); + + int HalfSize = VT.getVectorNumElements()/2; + + int FstHalf = 0, SndHalf = 0; + for (int i = 0; i < HalfSize; ++i) { + if (SVOp->getMaskElt(i) > 0) { + FstHalf = SVOp->getMaskElt(i)/HalfSize; + break; + } + } + for (int i = HalfSize; i < HalfSize*2; ++i) { + if (SVOp->getMaskElt(i) > 0) { + SndHalf = SVOp->getMaskElt(i)/HalfSize; + break; + } + } + + return (FstHalf | (SndHalf << 4)); +} + /// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. /// Note that VPERMIL mask matching is different depending whether theunderlying @@ -4317,6 +4391,11 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, DecodeVPERMILPDMask(4, cast(ImmN)->getZExtValue(), ShuffleMask); break; + case X86ISD::VPERM2F128: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERM2F128Mask(VT, cast(ImmN)->getZExtValue(), + ShuffleMask); + break; default: assert("not implemented for target shuffle node"); return SDValue(); @@ -6335,6 +6414,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, getShuffleVPERMILPDImmediate(SVOp), DAG); + // Handle VPERM2F128 permutations + if (isVPERM2F128Mask(M, VT, Subtarget)) + return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, + getShuffleVPERM2F128Immediate(SVOp), DAG); + //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, // lower it into other known shuffles. FIXME: this isn't true yet, but @@ -10052,6 +10136,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; + case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -13134,6 +13219,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERMILPSY: case X86ISD::VPERMILPD: case X86ISD::VPERMILPDY: + case X86ISD::VPERM2F128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI); } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 76267c10ba2..0aaef2a84d0 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -275,6 +275,7 @@ namespace llvm { VPERMILPSY, VPERMILPD, VPERMILPDY, + VPERM2F128, // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, // according to %al. An operator is needed so that this can be expanded diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 670bbc5763b..2ccf4c4785b 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -158,6 +158,8 @@ def X86VPermilpsy : SDNode<"X86ISD::VPERMILPSY", SDTShuff2OpI>; def X86VPermilpd : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>; def X86VPermilpdy : SDNode<"X86ISD::VPERMILPDY", SDTShuff2OpI>; +def X86VPerm2f128 : SDNode<"X86ISD::VPERM2F128", SDTShuff3OpI>; + //===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index a8d6a36a1f1..2f8b2a701de 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -5695,6 +5695,19 @@ def : Pat<(int_x86_avx_vperm2f128_si_256 VR256:$src1, (memopv8i32 addr:$src2), imm:$src3), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; +def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; + //===----------------------------------------------------------------------===// // VZERO - Zero YMM registers // diff --git a/test/CodeGen/X86/avx-vperm2f128.ll b/test/CodeGen/X86/avx-vperm2f128.ll new file mode 100644 index 00000000000..3550a908231 --- /dev/null +++ b/test/CodeGen/X86/avx-vperm2f128.ll @@ -0,0 +1,62 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s + +; CHECK: vperm2f128 $1 +define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +; CHECK: vperm2f128 $48 +define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +; CHECK: vperm2f128 $0 +define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +; CHECK: vperm2f128 $17 +define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +; CHECK: vperm2f128 $17 +define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { +entry: + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %shuffle +} + +; CHECK: vperm2f128 $33 +define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +entry: + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> + ret <4 x i64> %shuffle +} + +;;;; Cases with undef indicies mixed in the mask + +; CHECK: vperm2f128 $33 +define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +;;;; Cases we must not select vperm2f128 + +; CHECK: _G +; CHECK-NOT: vperm2f128 +define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +}