From: Sanjay Patel Date: Tue, 24 Mar 2015 20:36:42 +0000 (+0000) Subject: [X86, AVX] instcombine vperm2 intrinsics with zero inputs into shuffles X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=5e0ce9d13ad5b39c12b68e7af6cf946414bff170;p=oota-llvm.git [X86, AVX] instcombine vperm2 intrinsics with zero inputs into shuffles This is the IR optimizer follow-on patch for D8563: the x86 backend patch that converts this kind of shuffle back into a vperm2. This is also a continuation of the transform that started in D8486. In that patch, Andrea suggested that we could convert vperm2 intrinsics that use zero masks into a single shuffle. This is an implementation of that suggestion. Differential Revision: http://reviews.llvm.org/D8567 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233110 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index b59c9f5d910..8f7825a8664 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -204,7 +204,7 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder) { if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { VectorType *VecTy = cast<VectorType>(II.getType()); - uint8_t Imm = CInt->getZExtValue(); + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination @@ -213,37 +213,51 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, // [5:4] - select 128 bits from sources for high half of destination // [6] - ignore // [7] - zero high half of destination - - if ((Imm & 0x88) == 0x88) { - // If both zero mask bits are set, this was just a weird way to - // generate a zero vector. 
- return ConstantAggregateZero::get(VecTy); - } - // TODO: If a single zero bit is set, replace one of the source operands - // with a zero vector and use the same mask generation logic as below. + uint8_t Imm = CInt->getZExtValue(); + + bool LowHalfZero = Imm & 0x08; + bool HighHalfZero = Imm & 0x80; - if ((Imm & 0x88) == 0x00) { - // If neither zero mask bit is set, this is a simple shuffle. - unsigned NumElts = VecTy->getNumElements(); - unsigned HalfSize = NumElts / 2; - unsigned HalfBegin; - SmallVector<int, 8> ShuffleMask(NumElts); + // If both zero mask bits are set, this was just a weird way to + // generate a zero vector. + if (LowHalfZero && HighHalfZero) + return ZeroVector; - // Permute low half of result. - HalfBegin = (Imm & 0x3) * HalfSize; - for (unsigned i = 0; i != HalfSize; ++i) - ShuffleMask[i] = HalfBegin + i; + // If 0 or 1 zero mask bits are set, this is a simple shuffle. + unsigned NumElts = VecTy->getNumElements(); + unsigned HalfSize = NumElts / 2; + SmallVector<int, 8> ShuffleMask(NumElts); + + // The high bit of the selection field chooses the 1st or 2nd operand. + bool LowInputSelect = Imm & 0x02; + bool HighInputSelect = Imm & 0x20; - // Permute high half of result. - HalfBegin = ((Imm >> 4) & 0x3) * HalfSize; - for (unsigned i = HalfSize; i != NumElts; ++i) - ShuffleMask[i] = HalfBegin + i - HalfSize; - - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask); - } + // The low bit of the selection field chooses the low or high half + // of the selected operand. + bool LowHalfSelect = Imm & 0x01; + bool HighHalfSelect = Imm & 0x10; + + // Determine which operand(s) are actually in use for this instruction. + Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); + Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); + + // If needed, replace operands based on zero mask. + V0 = LowHalfZero ? ZeroVector : V0; + V1 = HighHalfZero ? 
ZeroVector : V1; + + // Permute low half of result. + unsigned StartIndex = LowHalfSelect ? HalfSize : 0; + for (unsigned i = 0; i < HalfSize; ++i) + ShuffleMask[i] = StartIndex + i; + + // Permute high half of result. + StartIndex = HighHalfSelect ? HalfSize : 0; + StartIndex += NumElts; + for (unsigned i = 0; i < HalfSize; ++i) + ShuffleMask[i + HalfSize] = StartIndex + i; + + return Builder.CreateShuffleVector(V0, V1, ShuffleMask); } return nullptr; } diff --git a/test/Transforms/InstCombine/x86-vperm2.ll b/test/Transforms/InstCombine/x86-vperm2.ll index 92cc4afefa7..864296dd189 100644 --- a/test/Transforms/InstCombine/x86-vperm2.ll +++ b/test/Transforms/InstCombine/x86-vperm2.ll @@ -76,7 +76,7 @@ define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) { ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x02 -; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> +; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> ; CHECK-NEXT: ret <4 x double> %1 } @@ -85,7 +85,7 @@ define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) { ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x03 -; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> +; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> ; CHECK-NEXT: ret <4 x double> %1 } @@ -111,7 +111,7 @@ define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) { ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x12 -; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> +; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> ; CHECK-NEXT: ret <4 x double> %1 } @@ -120,7 +120,7 @@ define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) { ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x13 -; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> +; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, 
<4 x i32> ; CHECK-NEXT: ret <4 x double> %1 } @@ -207,26 +207,41 @@ define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) { } -; Confirm that when a single zero mask bit is set, we do nothing. +; Confirm that when a single zero mask bit is set, we replace a source vector with zeros. + +define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 129) + ret <4 x double> %res + +; CHECK-LABEL: @perm2pd_0x81 +; CHECK-NEXT: shufflevector <4 x double> %a0, <4 x double> +; CHECK-NEXT: ret <4 x double> +} define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) { %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131) ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x83 -; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 -125) +; CHECK-NEXT: shufflevector <4 x double> %a1, <4 x double> ; CHECK-NEXT: ret <4 x double> } +define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 40) + ret <4 x double> %res -; Confirm that when the other zero mask bit is set, we do nothing. Also confirm that an ignored bit has no effect. 
+; CHECK-LABEL: @perm2pd_0x28 +; CHECK-NEXT: shufflevector <4 x double> %a1, <4 x i32> +; CHECK-NEXT: ret <4 x double> +} -define <4 x double> @perm2pd_0x48(<4 x double> %a0, <4 x double> %a1) { - %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72) +define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 8) ret <4 x double> %res -; CHECK-LABEL: @perm2pd_0x48 -; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72) +; CHECK-LABEL: @perm2pd_0x08 +; CHECK-NEXT: shufflevector <4 x double> %a0, <4 x i32> ; CHECK-NEXT: ret <4 x double> }