From: Michael J. Spencer Date: Thu, 24 Apr 2014 00:58:18 +0000 (+0000) Subject: [InstCombine][x86] Constant fold psll intrinsics. X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=96363d50018da6e9cfa549695f01aa0d4d2d6a31 [InstCombine][x86] Constant fold psll intrinsics. This excludes avx512 as I don't have hardware to verify. It excludes _dq variants because they are represented in the IR as <{2,4} x i64> when it's actually a byte shift of the entire i{128,265}. This also excludes _dq_bs as they aren't at all supported by the backend. There are also no corresponding instructions in the ISA. I have no idea why they exist... git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207058 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 5d4b063d5e5..40732d2f0b9 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -556,6 +556,47 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + // Constant fold << Ci. + // FIXME: We don't handle _dq because it's a shift of an i128, but is + // represented in the IR as <2 x i64>. A per element shift is wrong. + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: { + // Simplify if count is constant. To 0 if > BitWidth, otherwise to shl. + auto CDV = dyn_cast(II->getArgOperand(1)); + auto CInt = dyn_cast(II->getArgOperand(1)); + if (!CDV && !CInt) + break; + ConstantInt *Count; + if (CDV) + Count = cast(CDV->getElementAsConstant(0)); + else + Count = CInt; + + auto Vec = II->getArgOperand(0); + auto VT = cast(Vec->getType()); + if (Count->getZExtValue() > + VT->getElementType()->getPrimitiveSizeInBits() - 1) + return ReplaceInstUsesWith( + CI, ConstantAggregateZero::get(Vec->getType())); + else { + unsigned VWidth = VT->getNumElements(); + // Get a constant vector of the same type as the first operand. + auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue()); + return BinaryOperator::CreateShl( + Vec, Builder->CreateVectorSplat(VWidth, VTCI)); + } + break; + } case Intrinsic::x86_sse41_pmovsxbw: case Intrinsic::x86_sse41_pmovsxwd: diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll index 70370196e15..35ba09313d6 100644 --- a/test/Transforms/InstCombine/vec_demanded_elts.ll +++ b/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define i16 @test1(float %f) { entry: @@ -337,3 +338,112 @@ define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) { %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i32> ) ret <4 x double> %a } + +define <2 x i64> @test_sse2_1() nounwind readnone uwtable { + %S = bitcast i32 1 to i32 + %1 = zext i32 %S to i64 + %2 = insertelement <2 x i64> undef, i64 %1, i32 0 + %3 = insertelement <2 x i64> %2, i64 0, i32 1 + %4 = bitcast <2 x i64> %3 to <8 x i16> + %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> , <8 x i16> %4) + %6 = bitcast <8 x i16> %5 to <4 x i32> + %7 = bitcast <2 x i64> %3 to <4 x i32> + %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7) + %9 = bitcast <4 x i32> %8 to <2 x i64> + %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3) + %11 = bitcast <2 x i64> %10 to <8 x i16> + %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S) + %13 = bitcast <8 x i16> %12 to <4 x i32> + %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S) + %15 = bitcast <4 x i32> %14 to <2 x i64> + %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S) + ret <2 x i64> %16 + +; CHECK: test_sse2_1 +; CHECK: ret <2 x i64> +} + +define <4 x i64> @test_avx2_1() nounwind readnone uwtable { + %S = bitcast i32 1 to i32 + %1 = zext i32 %S to i64 + %2 = insertelement <2 x i64> undef, i64 %1, i32 0 + %3 = insertelement <2 x i64> %2, i64 0, i32 1 + %4 = bitcast <2 x i64> %3 to <8 x i16> + %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> , <8 x i16> %4) + %6 = bitcast <16 x i16> %5 to <8 x i32> + %7 = bitcast <2 x i64> %3 to <4 x i32> + %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7) + %9 = bitcast <8 x i32> %8 to <4 x i64> + %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3) + %11 = bitcast <4 x i64> %10 to <16 x i16> + %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S) + %13 = bitcast <16 x i16> %12 to <8 x i32> + %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S) + %15 = bitcast <8 x i32> %14 to <4 x i64> + %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S) + ret <4 x i64> %16 +; CHECK: test_avx2_1 +; CHECK: ret <4 x i64> +} + +define <2 x i64> @test_sse2_0() nounwind readnone uwtable { + %S = bitcast i32 128 to i32 + %1 = zext i32 %S to i64 + %2 = insertelement <2 x i64> undef, i64 %1, i32 0 + %3 = insertelement <2 x i64> %2, i64 0, i32 1 + %4 = bitcast <2 x i64> %3 to <8 x i16> + %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> , <8 x i16> %4) + %6 = bitcast <8 x i16> %5 to <4 x i32> + %7 = bitcast <2 x i64> %3 to <4 x i32> + %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7) + %9 = bitcast <4 x i32> %8 to <2 x i64> + %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3) + %11 = bitcast <2 x i64> %10 to <8 x i16> + %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S) + %13 = bitcast <8 x i16> %12 to <4 x i32> + %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S) + %15 = bitcast <4 x i32> %14 to <2 x i64> + %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S) + ret <2 x i64> %16 + +; CHECK: test_sse2_0 +; CHECK: ret <2 x i64> zeroinitializer +} + +define <4 x i64> @test_avx2_0() nounwind readnone uwtable { + %S = bitcast i32 128 to i32 + %1 = zext i32 %S to i64 + %2 = insertelement <2 x i64> undef, i64 %1, i32 0 + %3 = insertelement <2 x i64> %2, i64 0, i32 1 + %4 = bitcast <2 x i64> %3 to <8 x i16> + %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> , <8 x i16> %4) + %6 = bitcast <16 x i16> %5 to <8 x i32> + %7 = bitcast <2 x i64> %3 to <4 x i32> + %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7) + %9 = bitcast <8 x i32> %8 to <4 x i64> + %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3) + %11 = bitcast <4 x i64> %10 to <16 x i16> + %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S) + %13 = bitcast <16 x i16> %12 to <8 x i32> + %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S) + %15 = bitcast <8 x i32> %14 to <4 x i64> + %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S) + ret <4 x i64> %16 +; CHECK: test_avx2_0 +; CHECK: ret <4 x i64> zeroinitializer +} + +declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1 +declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1 +declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1 +declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1 +declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1 +declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1 +declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1 +declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1 +declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1 +declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1 +declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1 +declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1 + +attributes #1 = { nounwind readnone }