From 335fc618739df457dd0fd1bf40c2e03f3f224f48 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 13 Aug 2015 07:39:03 +0000
Subject: [PATCH] [InstCombine] SSE/AVX vector shifts demanded shift amount bits

Most SSE/AVX (non-constant) vector shift instructions only use the lower
64 bits of the 128-bit shift amount vector operand, so this patch calls
SimplifyDemandedVectorElts to optimize for this.

I had to refactor some of my recent InstCombiner work on the vector shifts
to avoid quite a bit of duplicate code; as a result, SimplifyX86immshift
now (re)decodes the type of shift.

Differential Revision: http://reviews.llvm.org/D11938

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@244872 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCalls.cpp          | 111 +++++++++----
 .../InstCombine/x86-vector-shifts.ll          | 148 ++++++++++++++++++
 2 files changed, 232 insertions(+), 27 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 9f366418ea3..1b9abfdacbf 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -198,8 +198,52 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
 }
 
 static Value *SimplifyX86immshift(const IntrinsicInst &II,
-                                  InstCombiner::BuilderTy &Builder,
-                                  bool LogicalShift, bool ShiftLeft) {
+                                  InstCombiner::BuilderTy &Builder) {
+  bool LogicalShift = false;
+  bool ShiftLeft = false;
+
+  switch (II.getIntrinsicID()) {
+  default:
+    return nullptr;
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_avx2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+    LogicalShift = false; ShiftLeft = false;
+    break;
+  case Intrinsic::x86_sse2_psrl_d:
+  case Intrinsic::x86_sse2_psrl_q:
+  case Intrinsic::x86_sse2_psrl_w:
+  case Intrinsic::x86_sse2_psrli_d:
+  case Intrinsic::x86_sse2_psrli_q:
+  case Intrinsic::x86_sse2_psrli_w:
+  case Intrinsic::x86_avx2_psrl_d:
+  case Intrinsic::x86_avx2_psrl_q:
+  case Intrinsic::x86_avx2_psrl_w:
+  case Intrinsic::x86_avx2_psrli_d:
+  case Intrinsic::x86_avx2_psrli_q:
+  case Intrinsic::x86_avx2_psrli_w:
+    LogicalShift = true; ShiftLeft = false;
+    break;
+  case Intrinsic::x86_sse2_psll_d:
+  case Intrinsic::x86_sse2_psll_q:
+  case Intrinsic::x86_sse2_psll_w:
+  case Intrinsic::x86_sse2_pslli_d:
+  case Intrinsic::x86_sse2_pslli_q:
+  case Intrinsic::x86_sse2_pslli_w:
+  case Intrinsic::x86_avx2_psll_d:
+  case Intrinsic::x86_avx2_psll_q:
+  case Intrinsic::x86_avx2_psll_w:
+  case Intrinsic::x86_avx2_pslli_d:
+  case Intrinsic::x86_avx2_pslli_q:
+  case Intrinsic::x86_avx2_pslli_w:
+    LogicalShift = true; ShiftLeft = true;
+    break;
+  }
   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
 
   // Simplify if count is constant.
@@ -788,51 +832,64 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   }
 
   // Constant fold ashr( <A x Bi>, Ci ).
-  case Intrinsic::x86_sse2_psra_d:
-  case Intrinsic::x86_sse2_psra_w:
+  // Constant fold lshr( <A x Bi>, Ci ).
+  // Constant fold shl( <A x Bi>, Ci ).
   case Intrinsic::x86_sse2_psrai_d:
   case Intrinsic::x86_sse2_psrai_w:
-  case Intrinsic::x86_avx2_psra_d:
-  case Intrinsic::x86_avx2_psra_w:
   case Intrinsic::x86_avx2_psrai_d:
   case Intrinsic::x86_avx2_psrai_w:
-    if (Value *V = SimplifyX86immshift(*II, *Builder, false, false))
-      return ReplaceInstUsesWith(*II, V);
-    break;
-
-  // Constant fold lshr( <A x Bi>, Ci ).
-  case Intrinsic::x86_sse2_psrl_d:
-  case Intrinsic::x86_sse2_psrl_q:
-  case Intrinsic::x86_sse2_psrl_w:
   case Intrinsic::x86_sse2_psrli_d:
   case Intrinsic::x86_sse2_psrli_q:
   case Intrinsic::x86_sse2_psrli_w:
-  case Intrinsic::x86_avx2_psrl_d:
-  case Intrinsic::x86_avx2_psrl_q:
-  case Intrinsic::x86_avx2_psrl_w:
   case Intrinsic::x86_avx2_psrli_d:
   case Intrinsic::x86_avx2_psrli_q:
   case Intrinsic::x86_avx2_psrli_w:
-    if (Value *V = SimplifyX86immshift(*II, *Builder, true, false))
+  case Intrinsic::x86_sse2_pslli_d:
+  case Intrinsic::x86_sse2_pslli_q:
+  case Intrinsic::x86_sse2_pslli_w:
+  case Intrinsic::x86_avx2_pslli_d:
+  case Intrinsic::x86_avx2_pslli_q:
+  case Intrinsic::x86_avx2_pslli_w:
+    if (Value *V = SimplifyX86immshift(*II, *Builder))
       return ReplaceInstUsesWith(*II, V);
     break;
 
-  // Constant fold shl( <A x Bi>, Ci ).
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_avx2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_sse2_psrl_d:
+  case Intrinsic::x86_sse2_psrl_q:
+  case Intrinsic::x86_sse2_psrl_w:
+  case Intrinsic::x86_avx2_psrl_d:
+  case Intrinsic::x86_avx2_psrl_q:
+  case Intrinsic::x86_avx2_psrl_w:
   case Intrinsic::x86_sse2_psll_d:
   case Intrinsic::x86_sse2_psll_q:
   case Intrinsic::x86_sse2_psll_w:
-  case Intrinsic::x86_sse2_pslli_d:
-  case Intrinsic::x86_sse2_pslli_q:
-  case Intrinsic::x86_sse2_pslli_w:
   case Intrinsic::x86_avx2_psll_d:
   case Intrinsic::x86_avx2_psll_q:
-  case Intrinsic::x86_avx2_psll_w:
-  case Intrinsic::x86_avx2_pslli_d:
-  case Intrinsic::x86_avx2_pslli_q:
-  case Intrinsic::x86_avx2_pslli_w:
-    if (Value *V = SimplifyX86immshift(*II, *Builder, true, true))
+  case Intrinsic::x86_avx2_psll_w: {
+    if (Value *V = SimplifyX86immshift(*II, *Builder))
       return ReplaceInstUsesWith(*II, V);
+
+    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
+    // operand to compute the shift amount.
+    auto ShiftAmt = II->getArgOperand(1);
+    auto ShiftType = cast<VectorType>(ShiftAmt->getType());
+    assert(ShiftType->getPrimitiveSizeInBits() == 128 &&
+           "Unexpected packed shift size");
+    unsigned VWidth = ShiftType->getNumElements();
+
+    APInt DemandedElts = APInt::getLowBitsSet(VWidth, VWidth / 2);
+    APInt UndefElts(VWidth, 0);
+    if (Value *V =
+            SimplifyDemandedVectorElts(ShiftAmt, DemandedElts, UndefElts)) {
+      II->setArgOperand(1, V);
+      return II;
+    }
     break;
+  }
 
   case Intrinsic::x86_sse41_pmovsxbd:
   case Intrinsic::x86_sse41_pmovsxbq:
diff --git a/test/Transforms/InstCombine/x86-vector-shifts.ll b/test/Transforms/InstCombine/x86-vector-shifts.ll
index 95700f08d81..26581e0560b 100644
--- a/test/Transforms/InstCombine/x86-vector-shifts.ll
+++ b/test/Transforms/InstCombine/x86-vector-shifts.ll
@@ -825,6 +825,154 @@ define <4 x i64> @avx2_psll_q_64(<4 x i64> %v) nounwind readnone uwtable {
   ret <4 x i64> %1
 }
 
+;
+; Vector Demanded Bits
+;
+
+define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_w_var
+; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1)
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psra_d_var
+; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1)
+  ret <4 x i32> %2
+}
+
+define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_w_var
+; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1)
+  ret <16 x i16> %2
+}
+
+define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psra_d_var
+; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1)
+  ret <8 x i32> %2
+}
+
+define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrl_w_var
+; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1)
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrl_d_var
+; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1)
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psrl_q_var
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1)
+  ret <2 x i64> %2
+}
+
+define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrl_w_var
+; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1)
+  ret <16 x i16> %2
+}
+
+define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrl_d_var
+; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1)
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psrl_q_var
+; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <4 x i64> %1
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1)
+  ret <4 x i64> %2
+}
+
+define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psll_w_var
+; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <8 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  %2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1)
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psll_d_var
+; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <4 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  %2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1)
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @sse2_psll_q_var
+; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <2 x i64> %1
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1)
+  ret <2 x i64> %2
+}
+
+define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psll_w_var
+; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %a)
+; CHECK-NEXT: ret <16 x i16> %1
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1)
+  ret <16 x i16> %2
+}
+
+define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psll_d_var
+; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %a)
+; CHECK-NEXT: ret <8 x i32> %1
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  %2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1)
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
+; CHECK-LABEL: @avx2_psll_q_var
+; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %a)
+; CHECK-NEXT: ret <4 x i64> %1
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1)
+  ret <4 x i64> %2
+}
+
 ;
 ; Constant Folding
 ;
-- 
2.34.1
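
Illustrative example (not part of the patch): a minimal IR sketch of the kind of simplification the new SimplifyDemandedVectorElts call enables, in the same spirit as the *_var tests above. The function name @sse2_psrl_q_dead_upper_lane and the constant 7 are invented for illustration only.

; Only the low 64 bits (lane 0) of the shift amount are demanded by psrl.q,
; so the insertelement into lane 1 is expected to become dead after this
; patch, leaving %a to be passed to the intrinsic directly.
define <2 x i64> @sse2_psrl_q_dead_upper_lane(<2 x i64> %v, <2 x i64> %a) {
  %amt = insertelement <2 x i64> %a, i64 7, i32 1
  %r = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %amt)
  ret <2 x i64> %r
}

declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>)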