From 518acfca4462b5f42e526773ff329ee771033bd8 Mon Sep 17 00:00:00 2001 From: Asaf Badouh Date: Mon, 28 Dec 2015 08:09:25 +0000 Subject: [PATCH] [X86][AVX512] add fp scalar broadcast intrinsics Differential Revision: http://reviews.llvm.org/D15790 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256489 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 21 +++++++--- lib/Target/X86/X86InstrAVX512.td | 5 --- lib/Target/X86/X86IntrinsicsInfo.h | 10 +++++ test/CodeGen/X86/avx512-intrinsics.ll | 43 +++++++++++++------- test/CodeGen/X86/avx512vl-intrinsics.ll | 52 +++++++++++++++++++++++++ 5 files changed, 106 insertions(+), 25 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 08005844cea..1cf26107374 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -4956,16 +4956,25 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_vbroadcast_ss_512 : GCCBuiltin<"__builtin_ia32_vbroadcastss512">, Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>; - def int_x86_avx512_vbroadcast_ss_ps_512 : - GCCBuiltin<"__builtin_ia32_vbroadcastss_ps512">, - Intrinsic<[llvm_v16f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_broadcast_ss_ps_512 : + GCCBuiltin<"__builtin_ia32_broadcastss512">, + Intrinsic<[llvm_v16f32_ty], [llvm_v4f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_mask_broadcast_ss_ps_256 : + GCCBuiltin<"__builtin_ia32_broadcastss256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v4f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_broadcast_ss_ps_128 : + GCCBuiltin<"__builtin_ia32_broadcastss128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_vbroadcast_sd_512 : GCCBuiltin<"__builtin_ia32_vbroadcastsd512">, Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>; - def int_x86_avx512_vbroadcast_sd_pd_512 : - GCCBuiltin<"__builtin_ia32_vbroadcastsd_pd512">, - Intrinsic<[llvm_v8f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; + def int_x86_avx512_mask_broadcast_sd_pd_512 : + GCCBuiltin<"__builtin_ia32_broadcastsd512">, + Intrinsic<[llvm_v8f64_ty], [llvm_v2f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_broadcast_sd_pd_256 : + GCCBuiltin<"__builtin_ia32_broadcastsd256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v2f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_pbroadcastb_128 : GCCBuiltin<"__builtin_ia32_pbroadcastb128_mask">, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 66316119498..8bf2925a75d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1102,11 +1102,6 @@ def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; -def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), - (VBROADCASTSSZr VR128X:$src)>; -def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), - (VBROADCASTSDZr VR128X:$src)>; - // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 5806985b5ce..8acd0c5df86 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -455,6 +455,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, INTR_TYPE_1OP_MASK, X86ISD::SUBV_BROADCAST, 0), X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, INTR_TYPE_1OP_MASK, diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 4a4032570e7..0aed97d1e8f 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -514,25 +514,40 @@ define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) { } declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly -define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0) { +define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) { ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512: -; CHECK: ## BB#0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.ps.512(<4 x float> %a0) ; <<16 x float>> [#uses=1] - ret <16 x float> %res +; CHECK: kmovw %edi, %k1 +; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 + + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1) + %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask) + %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res2, %res3 + ret <16 x float> %res4 } -declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.ps.512(<4 x float>) nounwind readonly +declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly + -define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0) { +define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) { ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512: -; CHECK: ## BB#0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.pd.512(<2 x double> %a0) ; <<8 x double>> [#uses=1] - ret <8 x double> %res +; CHECK: kmovw %eax, %k1 +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1) + %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask) + %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res2, %res3 + ret <8 x double> %res4 } -declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.pd.512(<2 x double>) nounwind readonly +declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512: diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index c63cca17391..3bdbf808743 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -5711,3 +5711,55 @@ define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 declare <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) { +; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256: +; CHECK: kmovw %eax, %k1 +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 + + %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) + %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask) + %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + %res3 = fadd <4 x double> %res, %res1 + %res4 = fadd <4 x double> %res2, %res3 + ret <4 x double> %res4 +} +declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4 x double>, i8) nounwind readonly + +define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) { +; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256: +; CHECK: kmovw %eax, %k1 +; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 + + %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) + %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask) + %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} +declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x float>, i8) nounwind readonly + +define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) { +; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128: +; CHECK: kmovw %eax, %k1 +; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 + + %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) + %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} +declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly + -- 2.34.1