From: Elena Demikhovsky
Date: Wed, 21 Aug 2013 09:36:02 +0000 (+0000)
Subject: AVX-512: Added SHIFT instructions.
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=8ba76daba09e79b10c4aad8f4298433c6dafa6d5

AVX-512: Added SHIFT instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188899 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 3c3f09f0fe1..6e9ecef74d1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -11269,6 +11269,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
+  case Intrinsic::x86_avx512_kortestz:
+  case Intrinsic::x86_avx512_kortestc: {
+    unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B;
+    SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
+    SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
+    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }

   // SSE/AVX shift intrinsics
   case Intrinsic::x86_sse2_psll_w:
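For reference, the lowering above leans on the KORTESTW flag behavior: the instruction ORs two 16-bit mask registers, setting ZF when the result is all zeros and CF when it is all ones, which is why kortestz selects X86::COND_E and kortestc selects X86::COND_B. A minimal C++ model of the intended i32 results (hypothetical helper names, not LLVM code):

    #include <cstdint>

    // kortestz: 1 iff (a | b) == 0       -> reads ZF (X86::COND_E)
    // kortestc: 1 iff (a | b) == 0xFFFF  -> reads CF (X86::COND_B)
    // Both results are zero-extended to i32, as in the lowering above.
    uint32_t kortestz_ref(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>(a | b) == 0x0000 ? 1u : 0u;
    }
    uint32_t kortestc_ref(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>(a | b) == 0xFFFF ? 1u : 0u;
    }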
@@ -12135,7 +12145,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,

   if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
       (Subtarget->hasInt256() &&
-       (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+       (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+      (Subtarget->hasAVX512() &&
+       (VT == MVT::v8i64 || VT == MVT::v16i32))) {
     if (Op.getOpcode() == ISD::SHL)
       return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
                          DAG.getConstant(ShiftAmt, MVT::i32));
@@ -12297,7 +12309,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
       VT == MVT::v4i32 || VT == MVT::v8i16 ||
       (Subtarget->hasInt256() &&
        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
-        VT == MVT::v8i32 || VT == MVT::v16i16))) {
+        VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+      (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
     SDValue BaseShAmt;
     EVT EltVT = VT.getVectorElementType();
@@ -12365,6 +12378,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
       case MVT::v4i64:
       case MVT::v8i32:
       case MVT::v16i16:
+      case MVT::v16i32:
+      case MVT::v8i64:
         return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
       }
     case ISD::SRA:
@@ -12374,6 +12389,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
       case MVT::v8i16:
       case MVT::v8i32:
       case MVT::v16i16:
+      case MVT::v16i32:
+      case MVT::v8i64:
         return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
       }
     case ISD::SRL:
@@ -12385,6 +12402,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
       case MVT::v4i64:
       case MVT::v8i32:
       case MVT::v16i16:
+      case MVT::v16i32:
+      case MVT::v8i64:
         return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
       }
   }
@@ -12393,7 +12412,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,

   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   if (!Subtarget->is64Bit() &&
-      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
+       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
       Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
     Amt = Amt.getOperand(0);
@@ -12442,6 +12462,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
     if (V.getNode())
       return V;

+  if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
+    return Op;
   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
   if (Subtarget->hasInt256()) {
     if (Op.getOpcode() == ISD::SRL &&
@@ -13350,6 +13372,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   case X86ISD::PTEST:              return "X86ISD::PTEST";
   case X86ISD::TESTP:              return "X86ISD::TESTP";
+  case X86ISD::TESTM:              return "X86ISD::TESTM";
+  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
+  case X86ISD::KTEST:              return "X86ISD::KTEST";
   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
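The X86ISD::TESTM node named above (and documented in the header diff below) is the AVX-512 counterpart of PTEST that writes a mask register instead of EFLAGS: each mask bit reports whether the AND of the corresponding lanes is nonzero. A short C++ model of the vptestmd case (hypothetical helper name, not LLVM code):

    #include <array>
    #include <cstdint>

    // Per-lane semantics behind X86ISD::TESTM as vptestmd uses it:
    // mask bit i is set when lane i of (a & b) is nonzero, producing a
    // 16-bit value destined for a VK16 mask register.
    uint16_t vptestmd_ref(const std::array<uint32_t, 16> &a,
                          const std::array<uint32_t, 16> &b) {
      uint16_t k = 0;
      for (int i = 0; i < 16; ++i)
        if (a[i] & b[i])
          k |= static_cast<uint16_t>(1u << i);
      return k;
    }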
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index eafe027a558..40b2a9ce767 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -274,7 +274,7 @@ namespace llvm {
       // PCMP* - Vector integer comparisons.
       PCMPEQ, PCMPGT,
-      // PCMP*M - Vector integer comparisons, the result is in a mask vector
+      // PCMP*M - Vector integer comparisons, the result is in a mask vector.
       PCMPEQM, PCMPGTM,

       /// CMPM, CMPMU - Vector comparison generating mask bits for fp and
@@ -295,12 +295,15 @@ namespace llvm {
       // MUL_IMM - X86 specific multiply by immediate.
       MUL_IMM,

-      // PTEST - Vector bitwise comparisons
+      // PTEST - Vector bitwise comparisons.
       PTEST,

-      // TESTP - Vector packed fp sign bitwise comparisons
+      // TESTP - Vector packed fp sign bitwise comparisons.
       TESTP,

+      // TESTM - Vector "test" in AVX-512, the result is in a mask vector.
+      TESTM,
+
       // OR/AND test for masks
       KORTEST,
       KTEST,

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index ccbd18edde2..c3fb8019cc9 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1691,3 +1691,144 @@ defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem,
                 SSE_ALU_ITINS_P.d, 0>, EVEX_V512, OpSize, VEX_W,
                 EVEX_CD8<64, CD8VF>;
+//===----------------------------------------------------------------------===//
+// AVX-512 VPTESTM instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
+              SDNode OpNode, ValueType vt> {
+  def rr : AVX5128I<opc, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))]>, EVEX_4V;
+  def rm : AVX5128I<opc, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1),
+                              (bitconvert (memop_frag addr:$src2))))]>, EVEX_4V;
+}
+
+defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem,
+                               memopv16i32, X86testm, v16i32>, EVEX_V512,
+                               EVEX_CD8<32, CD8VF>;
+defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem, memopv8i64,
+                               X86testm, v8i64>, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Shift instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
+                            string OpcodeStr,
+                            SDNode OpNode, RegisterClass RC, ValueType vt,
+                            X86MemOperand x86memop, PatFrag mem_frag> {
+  def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
+             (ins RC:$src1, i8imm:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (vt (OpNode RC:$src1, (i8 imm:$src2))))]>, EVEX_4V;
+  def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
+             (ins x86memop:$src1, i8imm:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (OpNode (mem_frag addr:$src1),
+                              (i8 imm:$src2)))]>, EVEX_4V;
+}
+
+multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            RegisterClass RC, ValueType vt, ValueType SrcVT,
+                            PatFrag bc_frag> {
+  // src2 is always 128-bit
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, VR128X:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))]>, EVEX_4V;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+             (ins RC:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (vt (OpNode RC:$src1,
+                              (bc_frag (memopv2i64 addr:$src2)))))]>, EVEX_4V;
+}
+
+defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli,
+                                VR512, v16i32, i512mem, memopv16i32>, EVEX_V512,
+                                EVEX_CD8<32, CD8VF>;
+defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl,
+                                VR512, v16i32, v4i32, bc_v4i32>, EVEX_V512,
+                                EVEX_CD8<32, CD8VQ>;
+
+defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli,
+                                VR512, v8i64, i512mem, memopv8i64>, EVEX_V512,
+                                EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl,
+                                VR512, v8i64, v2i64, bc_v2i64>, EVEX_V512,
+                                EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli,
+                                VR512, v16i32, i512mem, memopv16i32>, EVEX_V512,
+                                EVEX_CD8<32, CD8VF>;
+defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl,
+                                VR512, v16i32, v4i32, bc_v4i32>, EVEX_V512,
+                                EVEX_CD8<32, CD8VQ>;
+
+defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli,
+                                VR512, v8i64, i512mem, memopv8i64>, EVEX_V512,
+                                EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl,
+                                VR512, v8i64, v2i64, bc_v2i64>, EVEX_V512,
+                                EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai,
+                                VR512, v16i32, i512mem, memopv16i32>, EVEX_V512,
+                                EVEX_CD8<32, CD8VF>;
+defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra,
+                                VR512, v16i32, v4i32, bc_v4i32>, EVEX_V512,
+                                EVEX_CD8<32, CD8VQ>;
+
+defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai,
+                                VR512, v8i64, i512mem, memopv8i64>, EVEX_V512,
+                                EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra,
+                                VR512, v8i64, v2i64, bc_v2i64>, EVEX_V512,
+                                EVEX_CD8<64, CD8VQ>, VEX_W;
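Both shift forms above apply a single count to every lane: the ri/mi variants take an imm8, and the rr/rm variants (note the "src2 is always 128-bit" comment) take the count from the low quadword of an XMM register or a 128-bit load. A C++ model of the vpsrad case (hypothetical helper; assumes >> on a negative int32_t is an arithmetic shift, as on mainstream compilers):

    #include <array>
    #include <cstdint>

    // One count shifts all 16 lanes. Counts >= 32 saturate: arithmetic
    // right shifts fill every lane with its sign bit, while the logical
    // shifts (vpsrld/vpslld) would produce zero instead.
    std::array<int32_t, 16> vpsrad_ref(std::array<int32_t, 16> x,
                                       uint64_t count) {
      for (int32_t &lane : x)
        lane = count >= 32 ? (lane < 0 ? -1 : 0)
                           : (lane >> static_cast<int>(count));
      return x;
    }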
+//===-------------------------------------------------------------------===//
+// Variable Bit Shifts
+//===-------------------------------------------------------------------===//
+multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            RegisterClass RC, ValueType vt,
+                            X86MemOperand x86memop, PatFrag mem_frag> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+                (vt (OpNode RC:$src1, (vt RC:$src2))))]>,
+             EVEX_4V;
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+             (ins RC:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+                (vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>,
+             EVEX_4V;
+}
+
+defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32,
+                                 i512mem, memopv16i32>, EVEX_V512,
+                                 EVEX_CD8<32, CD8VF>;
+defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64,
+                                 i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;
+defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32,
+                                 i512mem, memopv16i32>, EVEX_V512,
+                                 EVEX_CD8<32, CD8VF>;
+defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64,
+                                 i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;
+defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32,
+                                 i512mem, memopv16i32>, EVEX_V512,
+                                 EVEX_CD8<32, CD8VF>;
+defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64,
+                                 i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;

diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 9f1c999cdd8..b23da040d9a 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -149,6 +149,9 @@ def X86ptest   : SDNode<"X86ISD::PTEST",  SDTX86CmpPTest>;
 def X86testp   : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
 def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
 def X86ktest   : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
+def X86testm   : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                                              SDTCisVec<1>,
+                                                              SDTCisSameAs<2, 1>]>>;
 def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
                         SDTypeProfile<1, 2, [SDTCisVec<0>,
                                              SDTCisVec<1>,
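Unlike the uniform shifts above, each defm here matches a plain IR shl/srl/sra whose second operand is a full vector: every lane shifts by its own count, which is also why the LowerShift change earlier can return v16i32/v8i64 shifts unchanged. A C++ model of vpsravd (hypothetical helper; same arithmetic-shift assumption as before):

    #include <array>
    #include <cstdint>

    // Per-lane variable shift: lane i of the result is x[i] shifted by
    // cnt[i]. Out-of-range counts (>= 32) fill with the sign bit in the
    // arithmetic form; vpsllvd/vpsrlvd would yield zero instead.
    std::array<int32_t, 16> vpsravd_ref(std::array<int32_t, 16> x,
                                        const std::array<uint32_t, 16> &cnt) {
      for (int i = 0; i < 16; ++i)
        x[i] = cnt[i] >= 32 ? (x[i] < 0 ? -1 : 0)
                            : (x[i] >> static_cast<int>(cnt[i]));
      return x;
    }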
diff --git a/test/CodeGen/X86/avx512-shift.ll b/test/CodeGen/X86/avx512-shift.ll
new file mode 100644
index 00000000000..8cdcf8ad062
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shift.ll
@@ -0,0 +1,108 @@
+;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+;CHECK-LABEL: shift_16_i32
+;CHECK: vpsrld
+;CHECK: vpslld
+;CHECK: vpsrad
+;CHECK: ret
+define <16 x i32> @shift_16_i32(<16 x i32> %a) {
+   %b = lshr <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+   %c = shl <16 x i32> %b, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
+   %d = ashr <16 x i32> %c, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
+   ret <16 x i32> %d;
+}
+
+;CHECK-LABEL: shift_8_i64
+;CHECK: vpsrlq
+;CHECK: vpsllq
+;CHECK: vpsraq
+;CHECK: ret
+define <8 x i64> @shift_8_i64(<8 x i64> %a) {
+   %b = lshr <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+   %c = shl <8 x i64> %b, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
+   %d = ashr <8 x i64> %c, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
+   ret <8 x i64> %d;
+}
+
+; CHECK-LABEL: variable_shl4
+; CHECK: vpsllvq %zmm
+; CHECK: ret
+define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) {
+  %k = shl <8 x i64> %x, %y
+  ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_shl5
+; CHECK: vpsllvd %zmm
+; CHECK: ret
+define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) {
+  %k = shl <16 x i32> %x, %y
+  ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_srl0
+; CHECK: vpsrlvd
+; CHECK: ret
+define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) {
+  %k = lshr <16 x i32> %x, %y
+  ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_srl2
+; CHECK: vpsrlvq
+; CHECK: ret
+define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) {
+  %k = lshr <8 x i64> %x, %y
+  ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_sra1
+; CHECK: vpsravd
+; CHECK: ret
+define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) {
+  %k = ashr <16 x i32> %x, %y
+  ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_sra2
+; CHECK: vpsravq %zmm
+; CHECK: ret
+define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) {
+  %k = ashr <8 x i64> %x, %y
+  ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_sra01_load
+; CHECK: vpsravd (%
+; CHECK: ret
+define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) {
+  %y1 = load <16 x i32>* %y
+  %k = ashr <16 x i32> %x, %y1
+  ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_shl1_load
+; CHECK: vpsllvd (%
+; CHECK: ret
+define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) {
+  %y1 = load <16 x i32>* %y
+  %k = shl <16 x i32> %x, %y1
+  ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_srl0_load
+; CHECK: vpsrlvd (%
+; CHECK: ret
+define <16 x i32> @variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) {
+  %y1 = load <16 x i32>* %y
+  %k = lshr <16 x i32> %x, %y1
+  ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_srl3_load
+; CHECK: vpsrlvq (%
+; CHECK: ret
+define <8 x i64> @variable_srl3_load(<8 x i64> %x, <8 x i64>* %y) {
+  %y1 = load <8 x i64>* %y
+  %k = lshr <8 x i64> %x, %y1
+  ret <8 x i64> %k
+}

diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index 26a77c19062..35efd405871 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -19,3 +19,19 @@ vextracti64x4 $1, %zmm9, %ymm17
 // CHECK: vextracti64x4
 // CHECK: encoding: [0x62,0x73,0xfd,0x48,0x3b,0x4f,0x10,0x01]
 vextracti64x4 $1, %zmm9, 512(%rdi)
+
+// CHECK: vpsrad
+// CHECK: encoding: [0x62,0xb1,0x35,0x40,0x72,0xe1,0x02]
+vpsrad $2, %zmm17, %zmm25
+
+// CHECK: vpsrad
+// CHECK: encoding: [0x62,0xf1,0x35,0x40,0x72,0x64,0xb7,0x08,0x02]
+vpsrad $2, 512(%rdi, %rsi, 4), %zmm25
+
+// CHECK: vpsrad
+// CHECK: encoding: [0x62,0x21,0x1d,0x48,0xe2,0xc9]
+vpsrad %xmm17, %zmm12, %zmm25
+
+// CHECK: vpsrad
+// CHECK: encoding: [0x62,0x61,0x1d,0x48,0xe2,0x4c,0xb7,0x20]
+vpsrad 512(%rdi, %rsi, 4), %zmm12, %zmm25
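The 0x08 and 0x20 displacement bytes in the encodings above come from EVEX compressed displacement: memory displacements are stored as a disp8 scaled by the operand's tuple size N, which the EVEX_CD8<...> annotations in the .td additions select. For the full 512-bit operand (CD8VF, N = 64), 512(%rdi,%rsi,4) encodes as 512/64 = 0x08; for the 128-bit shift-count operand (CD8VQ, N = 16) it encodes as 512/16 = 0x20. A C++ sketch of the compression rule (hypothetical helper, not the actual MC-layer API):

    #include <cstdint>
    #include <optional>

    // Compressed displacement (disp8*N): store disp/N in one signed byte
    // when disp is a multiple of N and the quotient fits in int8;
    // otherwise the encoder must fall back to a full 32-bit displacement.
    std::optional<int8_t> compressDisp8(int64_t disp, int64_t n) {
      if (disp % n != 0)
        return std::nullopt;
      int64_t scaled = disp / n;
      if (scaled < -128 || scaled > 127)
        return std::nullopt;
      return static_cast<int8_t>(scaled);  // e.g. 512/64 = 8 -> 0x08
    }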