From 5b209e84f498b0e98d7f92123eac50a651aa01e1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 5 Feb 2012 03:14:49 +0000 Subject: [PATCH] Add target specific node for PMULUDQ. Change patterns to use it and custom lower intrinsics to it. Use it instead of intrinsic to handle 64-bit vector multiplies. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149807 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 97 +++++++++---------------- lib/Target/X86/X86ISelLowering.h | 5 +- lib/Target/X86/X86InstrFragmentsSIMD.td | 4 + lib/Target/X86/X86InstrSSE.td | 32 ++++++-- 4 files changed, 69 insertions(+), 69 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 941f87bfc5a..4bdb7bf0feb 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -9426,6 +9426,10 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const } // Arithmetic intrinsics. + case Intrinsic::x86_sse2_pmulu_dq: + case Intrinsic::x86_avx2_pmulu_dq: + return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse3_hadd_ps: case Intrinsic::x86_sse3_hadd_pd: case Intrinsic::x86_avx_hadd_ps_256: @@ -10085,78 +10089,46 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) return Lower256IntArith(Op, DAG); + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && + "Only know how to lower V2I64/V4I64 multiply"); + DebugLoc dl = Op.getDebugLoc(); + // Ahi = psrlqi(a, 32); + // Bhi = psrlqi(b, 32); + // + // AloBlo = pmuludq(a, b); + // AloBhi = pmuludq(a, Bhi); + // AhiBlo = pmuludq(Ahi, b); + + // AloBhi = psllqi(AloBhi, 32); + // AhiBlo = psllqi(AhiBlo, 32); + // return AloBlo + AloBhi + AhiBlo; + SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); - if (VT == MVT::v4i64) { - assert(Subtarget->hasAVX2() && "Lowering v4i64 
multiply requires AVX2"); + SDValue ShAmt = DAG.getConstant(32, MVT::i32); - // ulong2 Ahi = __builtin_ia32_psrlqi256( a, 32); - // ulong2 Bhi = __builtin_ia32_psrlqi256( b, 32); - // ulong2 AloBlo = __builtin_ia32_pmuludq256( a, b ); - // ulong2 AloBhi = __builtin_ia32_pmuludq256( a, Bhi ); - // ulong2 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b ); - // - // AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 ); - // AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 ); - // return AloBlo + AloBhi + AhiBlo; - - SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, - DAG.getConstant(32, MVT::i32)); - SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, - DAG.getConstant(32, MVT::i32)); - SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), - A, B); - SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), - A, Bhi); - SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), - Ahi, B); - AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, - DAG.getConstant(32, MVT::i32)); - AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, - DAG.getConstant(32, MVT::i32)); - SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); - Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); - return Res; - } + SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); + SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); - assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); + // Bit cast to 32-bit vectors for MULUDQ + EVT MulVT = (VT == MVT::v2i64) ? 
MVT::v4i32 : MVT::v8i32; + A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); + B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); + Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); + Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); - // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); - // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); - // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); - // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); - // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); - // - // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); - // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); - // return AloBlo + AloBhi + AhiBlo; + SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); + SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); + SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); + + AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); + AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); - SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, - DAG.getConstant(32, MVT::i32)); - SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, - DAG.getConstant(32, MVT::i32)); - SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - A, B); - SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - A, Bhi); - SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - Ahi, B); - AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, - DAG.getConstant(32, MVT::i32)); - AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, - DAG.getConstant(32, MVT::i32)); SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); - Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); - return Res; + return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { @@ -11092,6 +11064,7 
@@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; + case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 9689bcd5863..d12dfcf7a2d 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -219,7 +219,7 @@ namespace llvm { // VZEXT_MOVL - Vector move low and zero extend. VZEXT_MOVL, - // VZEXT_MOVL - Vector move low and sign extend. + // VSEXT_MOVL - Vector move low and sign extend. VSEXT_MOVL, // VSHL, VSRL - 128-bit vector logical left / right shift @@ -283,6 +283,9 @@ namespace llvm { VPERM2X128, VBROADCAST, + // PMULUDQ - Vector multiply packed unsigned doubleword integers + PMULUDQ, + // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, // according to %al. An operator is needed so that this can be expanded // with control flow. diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index f239509a9c5..bc9fcd1401e 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -109,6 +109,10 @@ def X86vpcomu : SDNode<"X86ISD::VPCOMU", SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>; +def X86pmuludq : SDNode<"X86ISD::PMULUDQ", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; + // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get // translated into one of the target nodes below during lowering. // Note: this is a work in progress... 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 34478a026d2..d9a599c1bda 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3530,6 +3530,26 @@ multiclass PDI_binop_rmi opc, bits<8> opc2, Format ImmForm, [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))]>; } +/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types +multiclass PDI_binop_rm2 opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI; + def rm : PDI; +} } // ExeDomain = SSEPackedInt // 128-bit Integer Arithmetic @@ -3553,6 +3573,8 @@ defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64, i128mem, 0, 0>, VEX_4V; defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64, i128mem, 0, 0>, VEX_4V; +defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, + memopv2i64, i128mem, 1, 0>, VEX_4V; // Intrinsic forms defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b, @@ -3575,8 +3597,6 @@ defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w, VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; defm VPMULHW : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w, VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; -defm VPMULUDQ : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_sse2_pmulu_dq, - VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd, VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; defm VPAVGB : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b, @@ -3614,6 +3634,8 @@ defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64, i256mem, 0, 0>, VEX_4V; defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64, i256mem, 0, 0>, VEX_4V; +defm VPMULUDQY : PDI_binop_rm2<0xF4, 
"vpmuludq", X86pmuludq, v4i64, v8i32, + VR256, memopv4i64, i256mem, 1, 0>, VEX_4V; // Intrinsic forms defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b, @@ -3636,8 +3658,6 @@ defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w, VR256, memopv4i64, i256mem, 1, 0>, VEX_4V; defm VPMULHWY : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w, VR256, memopv4i64, i256mem, 1, 0>, VEX_4V; -defm VPMULUDQY : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_avx2_pmulu_dq, - VR256, memopv4i64, i256mem, 1, 0>, VEX_4V; defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd, VR256, memopv4i64, i256mem, 1, 0>, VEX_4V; defm VPAVGBY : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b, @@ -3675,6 +3695,8 @@ defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64, i128mem>; defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64, i128mem>; +defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, + memopv2i64, i128mem, 1>; // Intrinsic forms defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b, @@ -3697,8 +3719,6 @@ defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, VR128, memopv2i64, i128mem, 1>; defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, VR128, memopv2i64, i128mem, 1>; -defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, - VR128, memopv2i64, i128mem, 1>; defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, VR128, memopv2i64, i128mem, 1>; defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, -- 2.34.1