From b34c79e4bbe5accbb54d0291e8bef5d2bfef32e4 Mon Sep 17 00:00:00 2001
From: Bill Schmidt
Date: Wed, 20 Feb 2013 15:50:31 +0000
Subject: [PATCH] Fix PR15155: lost vadd/vsplat optimization.

During lowering of a BUILD_VECTOR, we look for opportunities to use a
vector splat. When the splatted value fits in 5 signed bits, a single
splat does the job. When it doesn't fit in 5 bits but does fit in 6 and
is an even value, we can splat half the value and add the result to
itself. This last optimization has recently stopped working because of
improved constant folding. To circumvent the folding, create a pseudo
node VADD_SPLAT that can be expanded during instruction selection.
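
As an illustration (the register numbers below are arbitrary, not part
of this patch): to materialize the v4i32 splat <18, 18, 18, 18>, the
value 18 needs 6 signed bits, but 18/2 = 9 fits in 5, so the
BUILD_VECTOR can be selected as a splat of 9 added to itself:

    vspltisw 2, 9        # v2 = <9, 9, 9, 9>
    vadduwm  3, 2, 2     # v3 = v2 + v2 = <18, 18, 18, 18>

The same pattern uses vspltisb/vaddubm for v16i8 and vspltish/vadduhm
for v8i16, as exercised by the new test case below.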

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@175632 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 30 ++++++++++
 lib/Target/PowerPC/PPCISelLowering.cpp | 16 +++---
 lib/Target/PowerPC/PPCISelLowering.h   |  5 ++
 test/CodeGen/PowerPC/vaddsplat.ll      | 77 ++++++++++++++++++++++++++
 4 files changed, 120 insertions(+), 8 deletions(-)
 create mode 100644 test/CodeGen/PowerPC/vaddsplat.ll

diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 0f943e82bad..01d731a8095 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -1322,6 +1322,36 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
                                   SDValue(Tmp, 0), GA);
   }
+  case PPCISD::VADD_SPLAT: {
+    // Convert: VADD_SPLAT elt, size
+    // Into:    tmp = VSPLTIS[BHW] elt
+    //          VADDU[BHW]M tmp, tmp
+    // Where:   [BHW] = B for size = 1, H for size = 2, W for size = 4
+    assert(isa<ConstantSDNode>(N->getOperand(0)) &&
+           isa<ConstantSDNode>(N->getOperand(1)) &&
+           "Invalid operand on VADD_SPLAT!");
+    int EltSize = N->getConstantOperandVal(1);
+    unsigned Opc1, Opc2;
+    EVT VT;
+    if (EltSize == 1) {
+      Opc1 = PPC::VSPLTISB;
+      Opc2 = PPC::VADDUBM;
+      VT = MVT::v16i8;
+    } else if (EltSize == 2) {
+      Opc1 = PPC::VSPLTISH;
+      Opc2 = PPC::VADDUHM;
+      VT = MVT::v8i16;
+    } else {
+      assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!");
+      Opc1 = PPC::VSPLTISW;
+      Opc2 = PPC::VADDUWM;
+      VT = MVT::v4i32;
+    }
+    SDValue Elt = getI32Imm(N->getConstantOperandVal(0));
+    SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, Elt);
+    SDValue TmpVal = SDValue(Tmp, 0);
+    return CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal);
+  }
   }
 
   return SelectCode(N);
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index e789112de21..f3ef38a3dca 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -594,6 +594,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::GET_TLSLD_ADDR:  return "PPCISD::GET_TLSLD_ADDR";
   case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
   case PPCISD::ADDI_DTPREL_L:   return "PPCISD::ADDI_DTPREL_L";
+  case PPCISD::VADD_SPLAT:      return "PPCISD::VADD_SPLAT";
   }
 }
 
@@ -5020,14 +5021,13 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   // If this value is in the range [-32,30] and is even, use:
   //     tmp = VSPLTI[bhw], result = add tmp, tmp
   if (SextVal >= -32 && SextVal <= 30 && (SextVal & 1) == 0) {
-    // FIXME: This is currently disabled because the ADD will be folded back
-    // into an invalid BUILD_VECTOR immediately.
-    return SDValue();
-#if 0
-    SDValue Res = BuildSplatI(SextVal >> 1, SplatSize, MVT::Other, DAG, dl);
-    Res = DAG.getNode(ISD::ADD, dl, Res.getValueType(), Res, Res);
-    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
-#endif
+    // To avoid having the optimization undone by constant folding, we
+    // convert to a pseudo that will be expanded later.
+    SDValue Elt = DAG.getConstant(SextVal >> 1, MVT::i32);
+    EVT VT = Op.getValueType();
+    int Size = VT == MVT::v16i8 ? 1 : (VT == MVT::v8i16 ? 2 : 4);
+    SDValue EltSize = DAG.getConstant(Size, MVT::i32);
+    return DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
   }
 
   // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 12b3df7c9a5..7cc2d1ac322 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -237,6 +237,11 @@ namespace llvm {
       /// sym@got@dtprel@l.
       ADDI_DTPREL_L,
 
+      /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
+      /// into an ADD of a VSPLTI with itself during instruction selection.
+      /// Necessary to avoid losing this optimization due to constant folds.
+      VADD_SPLAT,
+
       /// STD_32 - This is the STD instruction for use with "32-bit" registers.
       STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE,
 
diff --git a/test/CodeGen/PowerPC/vaddsplat.ll b/test/CodeGen/PowerPC/vaddsplat.ll
new file mode 100644
index 00000000000..b4c16c1bf91
--- /dev/null
+++ b/test/CodeGen/PowerPC/vaddsplat.ll
@@ -0,0 +1,77 @@
+; RUN: llc -O0 -mcpu=pwr7 < %s | FileCheck %s
+
+; Test optimization of build_vector into vadd/vsplt for 6-bit immediates.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%v4i32 = type <4 x i32>
+%v8i16 = type <8 x i16>
+%v16i8 = type <16 x i8>
+
+define void @test_v4i32_pos(%v4i32* %P, %v4i32* %S) {
+        %p = load %v4i32* %P
+        %r = add %v4i32 %p, < i32 18, i32 18, i32 18, i32 18 >
+        store %v4i32 %r, %v4i32* %S
+        ret void
+}
+
+; CHECK: test_v4i32_pos:
+; CHECK: vspltisw [[REG1:[0-9]+]], 9
+; CHECK: vadduwm {{[0-9]+}}, [[REG1]], [[REG1]]
+
+define void @test_v4i32_neg(%v4i32* %P, %v4i32* %S) {
+        %p = load %v4i32* %P
+        %r = add %v4i32 %p, < i32 -28, i32 -28, i32 -28, i32 -28 >
+        store %v4i32 %r, %v4i32* %S
+        ret void
+}
+
+; CHECK: test_v4i32_neg:
+; CHECK: vspltisw [[REG1:[0-9]+]], -14
+; CHECK: vadduwm {{[0-9]+}}, [[REG1]], [[REG1]]
+
+define void @test_v8i16_pos(%v8i16* %P, %v8i16* %S) {
+        %p = load %v8i16* %P
+        %r = add %v8i16 %p, < i16 30, i16 30, i16 30, i16 30, i16 30, i16 30, i16 30, i16 30 >
+        store %v8i16 %r, %v8i16* %S
+        ret void
+}
+
+; CHECK: test_v8i16_pos:
+; CHECK: vspltish [[REG1:[0-9]+]], 15
+; CHECK: vadduhm {{[0-9]+}}, [[REG1]], [[REG1]]
+
+define void @test_v8i16_neg(%v8i16* %P, %v8i16* %S) {
+        %p = load %v8i16* %P
+        %r = add %v8i16 %p, < i16 -32, i16 -32, i16 -32, i16 -32, i16 -32, i16 -32, i16 -32, i16 -32 >
+        store %v8i16 %r, %v8i16* %S
+        ret void
+}
+
+; CHECK: test_v8i16_neg:
+; CHECK: vspltish [[REG1:[0-9]+]], -16
+; CHECK: vadduhm {{[0-9]+}}, [[REG1]], [[REG1]]
+
+define void @test_v16i8_pos(%v16i8* %P, %v16i8* %S) {
+        %p = load %v16i8* %P
+        %r = add %v16i8 %p, < i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16 >
+        store %v16i8 %r, %v16i8* %S
+        ret void
+}
+
+; CHECK: test_v16i8_pos:
+; CHECK: vspltisb [[REG1:[0-9]+]], 8
+; CHECK: vaddubm {{[0-9]+}}, [[REG1]], [[REG1]]
+
+define void @test_v16i8_neg(%v16i8* %P, %v16i8* %S) {
+        %p = load %v16i8* %P
+        %r = add %v16i8 %p, < i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18 >
+        store %v16i8 %r, %v16i8* %S
+        ret void
+}
+
+; CHECK: test_v16i8_neg:
+; CHECK: vspltisb [[REG1:[0-9]+]], -9
+; CHECK: vaddubm {{[0-9]+}}, [[REG1]], [[REG1]]
+
-- 
2.34.1