From 05e61f71137f101e831877ee123fd4c74cfccb3c Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Tue, 16 Jun 2015 06:07:24 +0000
Subject: [PATCH] X86: optimized i64 vector multiply with constant

When we multiply two 64-bit vectors, we extract the lower and upper
parts of each operand and combine them with the PMULUDQ instruction.
When one of the operands is a constant, the upper part may be zero, and
we know this at compile time.
Example: %a = mul <4 x i64> %b, <4 x i64> <i64 5, i64 5, i64 5, i64 5>.
I check the value of the upper part and prevent the redundant
"multiply", "shift" and "add" operations.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239802 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 16 +++++++++----
 test/CodeGen/X86/pmul.ll           | 37 ++++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6228e4a9672..6b76fbc02ed 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16530,6 +16530,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
 
+  SDValue AhiBlo = Ahi;
+  SDValue AloBhi = Bhi;
   // Bit cast to 32-bit vectors for MULUDQ
   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
               (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
@@ -16539,11 +16541,15 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   Bhi = DAG.getBitcast(MulVT, Bhi);
 
   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
-  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
-  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
-
-  AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
-  AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
+  // After shifting right by 32 bits, a constant operand may be all-zero.
+  if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
+    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
+    AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
+  }
+  if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
+    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
+    AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
+  }
 
   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 21463b8539d..dbe5bd646c7 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 | FileCheck %s --check-prefix=AVX2
 
 define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
 ; SSE2-LABEL: mul8c:
@@ -75,10 +76,6 @@ define <2 x i64> @b(<2 x i64> %i) nounwind {
 ; ALL-NEXT: movdqa {{.*#+}} xmm1 = [117,117]
 ; ALL-NEXT: movdqa %xmm0, %xmm2
 ; ALL-NEXT: pmuludq %xmm1, %xmm2
-; ALL-NEXT: pxor %xmm3, %xmm3
-; ALL-NEXT: pmuludq %xmm0, %xmm3
-; ALL-NEXT: psllq $32, %xmm3
-; ALL-NEXT: paddq %xmm3, %xmm2
 ; ALL-NEXT: psrlq $32, %xmm0
 ; ALL-NEXT: pmuludq %xmm1, %xmm0
 ; ALL-NEXT: psllq $32, %xmm0
@@ -248,3 +245,35 @@ entry:
   %A = mul <2 x i64> %i, %j
   ret <2 x i64> %A
 }
+
+define <4 x i64> @b1(<4 x i64> %i) nounwind {
+; AVX2-LABEL: @b1
+; AVX2: vpbroadcastq
+; AVX2-NEXT: vpmuludq
+; AVX2-NEXT: vpsrlq $32
+; AVX2-NEXT: vpmuludq
+; AVX2-NEXT: vpsllq $32
+; AVX2-NEXT: vpaddq
+; AVX2-NEXT: retq
+entry:
+  %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
+  ret <4 x i64> %A
+}
+
+define <4 x i64> @b2(<4 x i64> %i, <4 x i64> %j) nounwind {
+; AVX2-LABEL: @b2
+; AVX2: vpmuludq
+; AVX2-NEXT: vpsrlq $32
+; AVX2-NEXT: vpmuludq
+; AVX2-NEXT: vpsllq $32
+; AVX2-NEXT: vpaddq
+; AVX2-NEXT: vpsrlq $32
+; AVX2-NEXT: vpmuludq
+; AVX2-NEXT: vpsllq $32
+; AVX2-NEXT: vpaddq
+; AVX2-NEXT: retq
+entry:
+  %A = mul <4 x i64> %i, %j
+  ret <4 x i64> %A
+}
+
-- 
2.34.1
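
For reference, the identity the patch exploits is the 32-bit decomposition of
a 64-bit product: with A = Ahi*2^32 + Alo and B = Bhi*2^32 + Blo,

  A*B mod 2^64 = Alo*Blo + ((Ahi*Blo + Alo*Bhi) << 32)

since the Ahi*Bhi term lies entirely above bit 63. Below is a minimal scalar
sketch in C++ of one vector lane (not part of the commit; mulLane and the
values in main are illustrative only). The runtime "if" tests stand in for
the compile-time ISD::isBuildVectorAllZeros checks the patch performs during
lowering, where the right shift of a constant splat has presumably already
been folded into a constant build_vector:

#include <cassert>
#include <cstdint>

// One 64-bit lane, assembled from 32x32->64 partial products the way the
// PMULUDQ-based lowering does it:
//   A*B mod 2^64 = Alo*Blo + ((Ahi*Blo + Alo*Bhi) << 32)
uint64_t mulLane(uint64_t A, uint64_t B) {
  uint64_t Alo = static_cast<uint32_t>(A), Ahi = A >> 32;
  uint64_t Blo = static_cast<uint32_t>(B), Bhi = B >> 32;

  uint64_t Res = Alo * Blo;       // PMULUDQ A, B -- always emitted
  if (Ahi != 0)                   // dropped when Ahi is known all-zero
    Res += (Ahi * Blo) << 32;     // PMULUDQ Ahi, B; PSLLQ 32; PADDQ
  if (Bhi != 0)                   // dropped when Bhi is known all-zero
    Res += (Alo * Bhi) << 32;     // PMULUDQ A, Bhi; PSLLQ 32; PADDQ
  return Res;
}

int main() {
  // 117 fits in 32 bits, so Bhi == 0 and one PMULUDQ/PSLLQ/PADDQ chain
  // disappears -- the case the updated @b and new @b1 tests check.
  assert(mulLane(0x123456789ABCDEF0ULL, 117) == 0x123456789ABCDEF0ULL * 117);
  // Both high halves nonzero: all three partial products are needed (@b2).
  assert(mulLane(~0ULL, ~0ULL) == ~0ULL * ~0ULL);
  return 0;
}

Skipping one partial product saves a PMULUDQ, a PSLLQ and a PADDQ per vector,
which is exactly the difference visible in the updated @b CHECK lines: the
pxor/pmuludq/psllq/paddq chain for the all-zero upper half of the constant
[117,117] is gone.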