From: Elena Demikhovsky Date: Thu, 19 Feb 2015 10:48:04 +0000 (+0000) Subject: AVX-512: Full implementation for VRNDSCALESS/SD instructions and intrinsics. X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=675d06d1d0aa235091757c0711fb94fb252720f9 AVX-512: Full implementation for VRNDSCALESS/SD instructions and intrinsics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229837 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 0271310f5d6..60deb3288a0 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -3193,12 +3193,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless">, - Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, - llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd">, - Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, - llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; def int_x86_avx512_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtrndss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index a07718b9bd8..d242597f75b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -17523,9 +17523,20 
@@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src2 = Op.getOperand(2); SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - SDValue RoundingMode = Op.getOperand(5); + // There are 2 kinds of intrinsics in this group: + // (1) With suppress-all-exceptions (sae) - 6 operands + // (2) With rounding mode and sae - 7 operands. + if (Op.getNumOperands() == 6) { + SDValue Sae = Op.getOperand(5); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, + Sae), + Mask, Src0, Subtarget, DAG); + } + assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); + SDValue RoundingMode = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, - RoundingMode), + RoundingMode, Sae), Mask, Src0, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: { diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 5d69c1fa188..9f94e9df875 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -393,7 +393,8 @@ namespace llvm { FMSUB_RND, FNMSUB_RND, FMADDSUB_RND, - FMSUBADD_RND, + FMSUBADD_RND, + RNDSCALE, // Compress and expand COMPRESS, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index dd5cc911b1e..5881da952e7 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -86,6 +86,8 @@ class X86VectorVTInfo("v" # !srl(Size, 5) # "i32"); @@ -4637,7 +4639,6 @@ let ExeDomain = d in { } // ExeDomain } - defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512, loadv16f32, SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>; @@ -4657,51 +4658,68 @@ def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), FROUND_CURRENT)), (VRNDSCALEPDZr VR512:$src1, imm:$src2)>; -multiclass avx512_rndscale_scalar opc, string OpcodeStr, - Operand x86memop, RegisterClass RC, Domain d> { -let ExeDomain = d in { - def 
r : AVX512AIi8, EVEX_4V; +multiclass +avx512_rndscale_scalar opc, string OpcodeStr, X86VectorVTInfo _> { - def m : AVX512AIi8, EVEX_4V; -} // ExeDomain -} - -defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", ssmem, FR32X, - SSEPackedSingle>, EVEX_CD8<32, CD8VT1>; + let ExeDomain = _.ExeDomain in { + defm r : AVX512_maskable_scalar; -defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", sdmem, FR64X, - SSEPackedDouble>, EVEX_CD8<64, CD8VT1>; + defm rb : AVX512_maskable_scalar, EVEX_B; -let Predicates = [HasAVX512] in { - def : Pat<(ffloor FR32X:$src), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>; - def : Pat<(f64 (ffloor FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x1))>; - def : Pat<(f32 (fnearbyint FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0xC))>; - def : Pat<(f64 (fnearbyint FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0xC))>; - def : Pat<(f32 (fceil FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x2))>; - def : Pat<(f64 (fceil FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x2))>; - def : Pat<(f32 (frint FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x4))>; - def : Pat<(f64 (frint FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x4))>; - def : Pat<(f32 (ftrunc FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x3))>; - def : Pat<(f64 (ftrunc FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>; -} + let mayLoad = 1 in + defm m : AVX512_maskable_scalar; + } + def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>; + def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>; + def : Pat<(ftrunc _.FRC:$src), 
(COPY_TO_REGCLASS + (_.VT (!cast(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>; + def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>; + def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>; + + def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x1))), _.FRC)>; + def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x2))), _.FRC)>; + def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x3))), _.FRC)>; + def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x4))), _.FRC)>; + def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0xc))), _.FRC)>; +} + +defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; def : Pat<(v16f32 (ffloor VR512:$src)), (VRNDSCALEPSZr VR512:$src, (i32 0x1))>; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index f8590e59dbf..25058a25646 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -223,6 +223,8 @@ def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisVec<0>, SDTCisInt<2>]>; def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, SDTCisVec<0>, SDTCisInt<3>]>; +def STDFp3SrcRm 
: SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisInt<3>, SDTCisInt<4>]>; def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; @@ -299,6 +301,7 @@ def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; +def X86RndScale : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index d32b448c619..e4368116a67 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -378,6 +378,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::RNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::RNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, X86ISD::FSUB_RND), X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB, @@ -396,8 +400,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + 
X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index b6375c1f618..8eb67c0a345 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -68,6 +68,14 @@ define <8 x double> @test7(<8 x double> %a) { ret <8 x double>%res } +declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32) + +define <2 x double> @test_rndsc_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; CHECK: vrndscalesd $11, %xmm{{.*}} {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x0b,0xd1,0x0b] + %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 5, i32 11, i32 4) + ret <2 x double>%res +} + declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32) define <16 x float> @test8(<16 x float> %a) { diff --git a/test/CodeGen/X86/avx512-round.ll b/test/CodeGen/X86/avx512-round.ll index 19d9f18b80a..ffeb2a85e91 100644 --- a/test/CodeGen/X86/avx512-round.ll +++ b/test/CodeGen/X86/avx512-round.ll @@ -79,3 +79,28 @@ define <8 x double> @nearbyint_v8f64(<8 x double> %a) { ret <8 x double> %res } declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p) + +define double @nearbyint_f64(double %a) { +; CHECK-LABEL: nearbyint_f64 +; CHECK: vrndscalesd $12, {{.*}}encoding: [0x62,0xf3,0xfd,0x08,0x0b,0xc0,0x0c] + %res = call double @llvm.nearbyint.f64(double %a) + ret double %res +} +declare double @llvm.nearbyint.f64(double %p) + +define float @floor_f32(float %a) { +; CHECK-LABEL: floor_f32 +; CHECK: vrndscaless $1, {{.*}}encoding: 
[0x62,0xf3,0x7d,0x08,0x0a,0xc0,0x01] + %res = call float @llvm.floor.f32(float %a) + ret float %res +} +declare float @llvm.floor.f32(float %p) + +define float @floor_f32m(float* %aptr) { +; CHECK-LABEL: floor_f32m +; CHECK: vrndscaless $1, (%rdi), {{.*}}encoding: [0x62,0xf3,0x7d,0x08,0x0a,0x07,0x01] + %a = load float* %aptr, align 4 + %res = call float @llvm.floor.f32(float %a) + ret float %res +} +