From 6ec3395335addb3d352881d84a93cae8a235af95 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio
Date: Sat, 28 Dec 2013 11:11:52 +0000
Subject: [PATCH] [X86] Teach the backend how to fold target specific dag node
 for packed vector shift by immediate count (VSHLI/VSRLI/VSRAI) into a
 build_vector when the vector in input to the shift is a build_vector of all
 constants or UNDEFs.

Target specific nodes for packed shifts by immediate count are in general
introduced by the function 'getTargetVShiftByConstNode' (in
X86ISelLowering.cpp) when lowering shift operations, SSE/AVX immediate shift
intrinsics and (only in very few cases) SIGN_EXTEND_INREG dag nodes.

This patch adds extra rules for simplifying vector shifts inside the function
'getTargetVShiftByConstNode'.

Added file test/CodeGen/X86/vec_shift5.ll to verify that packed shifts by
immediate count are correctly folded into a build_vector when the input
vector to the shift dag node is a vector of constants or undefs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@198113 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  55 +++++++++-
 test/CodeGen/X86/vec_shift5.ll     | 160 +++++++++++++++++++++++++++++
 2 files changed, 213 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/X86/vec_shift5.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 3011407eb32..4fd74616590 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -11103,11 +11103,12 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
                                           SDValue SrcOp, uint64_t ShiftAmt,
                                           SelectionDAG &DAG) {
+  EVT ElementType = VT.getVectorElementType();
 
   // Check for ShiftAmt >= element width
-  if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) {
+  if (ShiftAmt >= ElementType.getSizeInBits()) {
     if (Opc == X86ISD::VSRAI)
-      ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1;
+      ShiftAmt = ElementType.getSizeInBits() - 1;
     else
       return DAG.getConstant(0, VT);
   }
@@ -11115,6 +11116,56 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI ||
           Opc == X86ISD::VSRAI) &&
          "Unknown target vector shift-by-constant node");
 
+  // Fold this packed vector shift into a build vector if SrcOp is a
+  // vector of ConstantSDNodes or UNDEFs.
+  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+    SmallVector<SDValue, 8> Elts;
+    unsigned NumElts = SrcOp->getNumOperands();
+    ConstantSDNode *ND;
+
+    switch(Opc) {
+    default: llvm_unreachable(0);
+    case X86ISD::VSHLI:
+      for (unsigned i=0; i!=NumElts; ++i) {
+        SDValue CurrentOp = SrcOp->getOperand(i);
+        if (CurrentOp->getOpcode() == ISD::UNDEF) {
+          Elts.push_back(CurrentOp);
+          continue;
+        }
+        ND = cast<ConstantSDNode>(CurrentOp);
+        const APInt &C = ND->getAPIntValue();
+        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
+      }
+      break;
+    case X86ISD::VSRLI:
+      for (unsigned i=0; i!=NumElts; ++i) {
+        SDValue CurrentOp = SrcOp->getOperand(i);
+        if (CurrentOp->getOpcode() == ISD::UNDEF) {
+          Elts.push_back(CurrentOp);
+          continue;
+        }
+        ND = cast<ConstantSDNode>(CurrentOp);
+        const APInt &C = ND->getAPIntValue();
+        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
+      }
+      break;
+    case X86ISD::VSRAI:
+      for (unsigned i=0; i!=NumElts; ++i) {
+        SDValue CurrentOp = SrcOp->getOperand(i);
+        if (CurrentOp->getOpcode() == ISD::UNDEF) {
+          Elts.push_back(CurrentOp);
+          continue;
+        }
+        ND = cast<ConstantSDNode>(CurrentOp);
+        const APInt &C = ND->getAPIntValue();
+        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
+      }
+      break;
+    }
+
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts);
+  }
+
   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
 }
diff --git a/test/CodeGen/X86/vec_shift5.ll b/test/CodeGen/X86/vec_shift5.ll
new file mode 100644
index 00000000000..2e98003ae1c
--- /dev/null
+++ b/test/CodeGen/X86/vec_shift5.ll
@@ -0,0 +1,160 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=-sse4.1 < %s | FileCheck %s
+
+; Verify that we correctly fold target specific packed vector shifts by
+; immediate count into a simple build_vector when the elements of the vector
+; in input to the packed shift are all constants or undef.
+
+define <8 x i16> @test1() {
+  %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 1, i16 2, i16 4, i16 8>, i32 3)
+  ret <8 x i16> %1
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test2() {
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 8, i16 16, i16 32, i16 64, i16 8, i16 16, i16 32, i16 64>, i32 3)
+  ret <8 x i16> %1
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test3() {
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 8, i16 16, i16 32, i16 64, i16 8, i16 16, i16 32, i16 64>, i32 3)
+  ret <8 x i16> %1
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test4() {
+  %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test5() {
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 8, i32 16, i32 32, i32 64>, i32 3)
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test6() {
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 8, i32 16, i32 32, i32 64>, i32 3)
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test7() {
+  %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 1, i64 2>, i32 3)
+  ret <2 x i64> %1
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test8() {
+  %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 8, i64 16>, i32 3)
+  ret <2 x i64> %1
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test9() {
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 8, i16 16, i16 undef, i16 undef, i16 32, i16 undef, i16 64, i16 128>, i32 3)
+  ret <8 x i16> %1
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test10() {
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 undef, i32 16, i32 32, i32 undef>, i32 3)
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test11() {
+  %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 undef, i64 32>, i32 3)
+  ret <2 x i64> %1
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test12() {
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 undef, i16 8, i16 16, i16 undef, i16 undef, i16 32, i16 64, i16 undef>, i32 3)
+  ret <8 x i16> %1
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test13() {
+  %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 8, i32 undef, i32 undef, i32 64>, i32 3)
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test14() {
+  %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 undef, i16 16, i16 32, i16 undef, i16 64, i16 undef, i16 128, i16 8>, i32 3)
+  ret <8 x i16> %1
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test15() {
+  %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 undef, i32 2, i32 4, i32 undef>, i32 3)
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test16() {
+  %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 undef, i64 2>, i32 3)
+  ret <2 x i64> %1
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32)
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32)
+
-- 
2.34.1
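
For readers unfamiliar with the transformation, here is a worked example of the fold this patch implements. It is only an illustration: the function name @fold_example and the input constants are hypothetical and are not taken from the patch or from test/CodeGen/X86/vec_shift5.ll.

  define <4 x i32> @fold_example() {
    ; The shift-by-immediate intrinsic is applied to a build_vector of
    ; constants, so getTargetVShiftByConstNode can evaluate it at compile
    ; time: <1, 2, 4, 8> shifted left by 3 becomes <8, 16, 32, 64>.
    %r = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
    ret <4 x i32> %r
  }

  declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)

With this patch applied, llc should emit no pslld for @fold_example; as in the tests above, the folded constant is expected to be materialized with a single load from the constant pool (a movaps) followed by ret.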