From 2d31ad0715e5a541901ce7ea7c8ba4f1bf0165d7 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio
Date: Wed, 30 Sep 2015 16:44:39 +0000
Subject: [PATCH] [InstCombine] Teach how to convert SSSE3/AVX2 byte shuffles
 to builtin shuffles if the shuffle mask is constant.

This patch teaches InstCombiner how to convert an SSSE3/AVX2 byte shuffle
to a builtin shuffle if the mask is constant.

Converting byte shuffle intrinsic calls to builtin shuffles can help find
more opportunities for combining shuffles later on in the selection DAG.

We may end up with byte shuffles with constant masks as the result of
inlining.

Differential Revision: http://reviews.llvm.org/D13252

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248913 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCalls.cpp       |  41 +++
 test/Transforms/InstCombine/x86-pshufb.ll  | 267 ++++++++++++++++++
 2 files changed, 308 insertions(+)
 create mode 100644 test/Transforms/InstCombine/x86-pshufb.ll

diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 33e26b12e01..8f3deacf248 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1163,6 +1163,47 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     break;
   }
 
+  case Intrinsic::x86_ssse3_pshuf_b_128:
+  case Intrinsic::x86_avx2_pshuf_b: {
+    // Turn pshufb(V1,mask) -> shuffle(V1,Zero,mask) if mask is a constant.
+    auto *V = II->getArgOperand(1);
+    auto *VTy = cast<VectorType>(V->getType());
+    unsigned NumElts = VTy->getNumElements();
+    assert((NumElts == 16 || NumElts == 32) &&
+           "Unexpected number of elements in shuffle mask!");
+    // Initialize the resulting shuffle mask to all zeroes.
+    uint32_t Indexes[32] = {0};
+
+    if (auto *Mask = dyn_cast<ConstantDataVector>(V)) {
+      // Each byte in the shuffle control mask forms an index to permute the
+      // corresponding byte in the destination operand.
+      for (unsigned I = 0; I < NumElts; ++I) {
+        int8_t Index = Mask->getElementAsInteger(I);
+        // If the most significant bit (bit[7]) of a shuffle control byte is
+        // set, then zero is written to the corresponding result byte. The
+        // zero vector forms the right-hand side of the resulting
+        // shufflevector.
+
+        // Otherwise, the index is the least significant 4 bits of the
+        // shuffle control byte.
+        Indexes[I] = (Index < 0) ? NumElts : Index & 0xF;
+      }
+    } else if (!isa<ConstantAggregateZero>(V))
+      break;
+
+    // AVX2's vpshufb shuffles the two 128-bit lanes independently; offset
+    // high-lane indices by 16 so they select from the high source lane.
+    for (unsigned I = 16; I < NumElts; ++I)
+      Indexes[I] += I & 0xF0;
+
+    auto NewC = ConstantDataVector::get(V->getContext(),
+                                        makeArrayRef(Indexes, NumElts));
+    auto V1 = II->getArgOperand(0);
+    auto V2 = Constant::getNullValue(II->getType());
+    auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC);
+    return ReplaceInstUsesWith(CI, Shuffle);
+  }
+
   case Intrinsic::x86_avx_vpermilvar_ps:
   case Intrinsic::x86_avx_vpermilvar_ps_256:
   case Intrinsic::x86_avx_vpermilvar_pd:
diff --git a/test/Transforms/InstCombine/x86-pshufb.ll b/test/Transforms/InstCombine/x86-pshufb.ll
new file mode 100644
index 00000000000..caaaed8910a
--- /dev/null
+++ b/test/Transforms/InstCombine/x86-pshufb.ll
@@ -0,0 +1,267 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Verify that instcombine is able to fold identity shuffles.
+
+define <16 x i8> @identity_test(<16 x i8> %InVec) {
+; CHECK-LABEL: @identity_test
+; CHECK: ret <16 x i8> %InVec
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @identity_test_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx2
+; CHECK: ret <32 x i8> %InVec
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+
+; Verify that instcombine is able to fold byte shuffles with zero masks.
+
+define <16 x i8> @fold_to_zero_vector(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector
+; CHECK: ret <16 x i8> zeroinitializer
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_to_zero_vector_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx2
+; CHECK: ret <32 x i8> zeroinitializer
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+; Instcombine should be able to fold the following byte shuffle to a builtin shufflevector
+; with a shuffle mask of all zeroes.
+
+define <16 x i8> @splat_test(<16 x i8> %InVec) {
+; CHECK-LABEL: @splat_test
+; CHECK: shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> zeroinitializer
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> zeroinitializer)
+  ret <16 x i8> %1
+}
+
+; In the test case below, elements in the low 128-bit lane of the result
+; vector are equal to the lowest byte of %InVec (shuffle index 0).
+; Elements in the high 128-bit lane of the result vector are equal to
+; the lowest byte in the high 128-bit lane of %InVec (shuffle index 16).
+
+define <32 x i8> @splat_test_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @splat_test_avx2
+; CHECK: shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> zeroinitializer)
+  ret <32 x i8> %1
+}
+
+; Each of the byte shuffles in the following tests is equivalent to a blend between
+; vector %InVec and a vector of all zeroes.
+
+define <16 x i8> @blend1(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend1
+; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> 
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend2(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend2
+; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> 
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend3(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend3
+; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> 
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend4(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend4
+; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> 
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend5(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend5
+; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> 
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend6(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend6
+; CHECK: shufflevector <16 x i8> %InVec, {{.*}}, <16 x i32> 
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @blend1_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend1_avx2
+; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend2_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend2_avx2
+; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend3_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend3_avx2
+; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend4_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend4_avx2
+; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend5_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend5_avx2
+; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend6_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend6_avx2
+; CHECK: shufflevector <32 x i8> %InVec, {{.*}}, <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+; movq idiom: the low 8 bytes are preserved and all higher bytes are zeroed.
+define <16 x i8> @movq_idiom(<16 x i8> %InVec) {
+; CHECK-LABEL: @movq_idiom
+; CHECK: shufflevector <16 x i8> %InVec, <16 x i8> , <16 x i32> 
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @movq_idiom_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @movq_idiom_avx2
+; CHECK: shufflevector <32 x i8> %InVec, <32 x i8> , <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+; Vector permutations using byte shuffles.
+
+define <16 x i8> @permute1(<16 x i8> %InVec) {
+; CHECK-LABEL: @permute1
+; CHECK: shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> 
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @permute2(<16 x i8> %InVec) {
+; CHECK-LABEL: @permute2
+; CHECK: shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> 
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @permute1_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @permute1_avx2
+; CHECK: shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @permute2_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @permute2_avx2
+; CHECK: shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+; Test that instcombine correctly folds a pshufb whose control bytes are
+; negative values other than -128 or values that do not fit in four bits;
+; only the sign bit and the low four bits of each byte matter.
+
+define <16 x i8> @identity_test2_2(<16 x i8> %InVec) {
+; CHECK-LABEL: @identity_test2_2
+; CHECK: ret <16 x i8> %InVec
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @identity_test_avx2_2(<32 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx2_2
+; CHECK: ret <32 x i8> %InVec
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <16 x i8> @fold_to_zero_vector_2(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_2
+; CHECK: ret <16 x i8> zeroinitializer
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_to_zero_vector_avx2_2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx2_2
+; CHECK: ret <32 x i8> zeroinitializer
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <16 x i8> @permute3(<16 x i8> %InVec) {
+; CHECK-LABEL: @permute3
+; CHECK: shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> 
+
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @permute3_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @permute3_avx2
+; CHECK: shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> 
+
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> )
+  ret <32 x i8> %1
+}
+
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
-- 
2.34.1
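As a quick illustration of the mapping the new InstCombine case performs, the
standalone sketch below recomputes shufflevector indices from a pshufb control
mask. It is not part of the patch: the function name pshufbToShuffleMask and
the use of std::vector are choices made for this example only. The key point
is that in a two-operand shufflevector, indices greater than or equal to
NumElts select from the second operand; since the transform uses an all-zeros
second operand, a control byte with its sign bit set maps to index NumElts.

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Model of the pshufb -> shufflevector index mapping used by the patch.
  std::vector<uint32_t> pshufbToShuffleMask(const int8_t *Mask,
                                            unsigned NumElts) {
    assert((NumElts == 16 || NumElts == 32) &&
           "pshufb operates on 16 or 32 bytes");
    std::vector<uint32_t> Indexes(NumElts);
    for (unsigned I = 0; I != NumElts; ++I) {
      int8_t C = Mask[I];
      // A set sign bit writes zero to the result byte: select from the
      // all-zeros second operand. Otherwise only the low four bits matter.
      Indexes[I] = (C < 0) ? NumElts : static_cast<uint32_t>(C & 0xF);
      // vpshufb shuffles each 128-bit lane independently, so bias indices
      // in the high lane by 16 to stay inside the high lane of the source.
      Indexes[I] += I & 0xF0;
    }
    return Indexes;
  }

  int main() {
    // Identity mask: control byte I selects input byte I, so the resulting
    // shuffle mask is 0..15 and the whole shuffle folds away.
    int8_t Identity[16];
    for (int I = 0; I != 16; ++I)
      Identity[I] = static_cast<int8_t>(I);
    std::vector<uint32_t> Id = pshufbToShuffleMask(Identity, 16);
    for (int I = 0; I != 16; ++I)
      assert(Id[I] == static_cast<uint32_t>(I));

    // All-negative mask: every index is 16, i.e. every result byte comes
    // from the zero vector, so the call folds to zeroinitializer.
    int8_t AllNeg[16];
    for (int I = 0; I != 16; ++I)
      AllNeg[I] = -128;
    std::vector<uint32_t> Zero = pshufbToShuffleMask(AllNeg, 16);
    for (int I = 0; I != 16; ++I)
      assert(Zero[I] == 16);
    return 0;
  }

Compiling and running this sketch exercises the same identity and all-zero
cases that @identity_test and @fold_to_zero_vector cover in the test file.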