From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 24 Mar 2015 20:36:42 +0000 (+0000)
Subject: [X86, AVX] instcombine vperm2 intrinsics with zero inputs into shuffles
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=5e0ce9d13ad5b39c12b68e7af6cf946414bff170;p=oota-llvm.git

[X86, AVX] instcombine vperm2 intrinsics with zero inputs into shuffles

This is the IR optimizer follow-on patch for D8563: the x86 backend patch
that converts this kind of shuffle back into a vperm2.

This is also a continuation of the transform that started in D8486.
In that patch, Andrea suggested that we could convert vperm2 intrinsics that
use zero masks into a single shuffle.

This is an implementation of that suggestion.

Differential Revision: http://reviews.llvm.org/D8567



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233110 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index b59c9f5d910..8f7825a8664 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -204,7 +204,7 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
   if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
     VectorType *VecTy = cast<VectorType>(II.getType());
-    uint8_t Imm = CInt->getZExtValue();
+    ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
 
     // The immediate permute control byte looks like this:
     //    [1:0] - select 128 bits from sources for low half of destination
@@ -213,37 +213,51 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II,
     //    [5:4] - select 128 bits from sources for high half of destination
     //    [6]   - ignore
     //    [7]   - zero high half of destination
-    
-    if ((Imm & 0x88) == 0x88) {
-      // If both zero mask bits are set, this was just a weird way to
-      // generate a zero vector.
-      return ConstantAggregateZero::get(VecTy);
-    }
 
-    // TODO: If a single zero bit is set, replace one of the source operands
-    // with a zero vector and use the same mask generation logic as below.
+    uint8_t Imm = CInt->getZExtValue();
+
+    bool LowHalfZero = Imm & 0x08;
+    bool HighHalfZero = Imm & 0x80;
 
-    if ((Imm & 0x88) == 0x00) {
-      // If neither zero mask bit is set, this is a simple shuffle.
-      unsigned NumElts = VecTy->getNumElements();
-      unsigned HalfSize = NumElts / 2;
-      unsigned HalfBegin;
-      SmallVector<int, 8> ShuffleMask(NumElts);
+    // If both zero mask bits are set, this was just a weird way to
+    // generate a zero vector.
+    if (LowHalfZero && HighHalfZero)
+      return ZeroVector;
 
-      // Permute low half of result.
-      HalfBegin = (Imm & 0x3) * HalfSize;
-      for (unsigned i = 0; i != HalfSize; ++i)
-        ShuffleMask[i] = HalfBegin + i;
+    // If 0 or 1 zero mask bits are set, this is a simple shuffle.
+    unsigned NumElts = VecTy->getNumElements();
+    unsigned HalfSize = NumElts / 2;
+    SmallVector<int, 8> ShuffleMask(NumElts);
+
+    // The high bit of the selection field chooses the 1st or 2nd operand.
+    bool LowInputSelect = Imm & 0x02;
+    bool HighInputSelect = Imm & 0x20;
     
-      // Permute high half of result.
-      HalfBegin = ((Imm >> 4) & 0x3) * HalfSize;
-      for (unsigned i = HalfSize; i != NumElts; ++i)
-        ShuffleMask[i] = HalfBegin + i - HalfSize;
-
-      Value *Op0 = II.getArgOperand(0);
-      Value *Op1 = II.getArgOperand(1);
-      return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask);
-    }
+    // The low bit of the selection field chooses the low or high half
+    // of the selected operand.
+    bool LowHalfSelect = Imm & 0x01;
+    bool HighHalfSelect = Imm & 0x10;
+
+    // Determine which operand(s) are actually in use for this instruction.
+    Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
+    Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
+    
+    // If needed, replace operands based on zero mask.
+    V0 = LowHalfZero ? ZeroVector : V0;
+    V1 = HighHalfZero ? ZeroVector : V1;
+    
+    // Permute low half of result.
+    unsigned StartIndex = LowHalfSelect ? HalfSize : 0;
+    for (unsigned i = 0; i < HalfSize; ++i)
+      ShuffleMask[i] = StartIndex + i;
+
+    // Permute high half of result.
+    StartIndex = HighHalfSelect ? HalfSize : 0;
+    StartIndex += NumElts;
+    for (unsigned i = 0; i < HalfSize; ++i)
+      ShuffleMask[i + HalfSize] = StartIndex + i;
+
+    return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
   }
   return nullptr;
 }
diff --git a/test/Transforms/InstCombine/x86-vperm2.ll b/test/Transforms/InstCombine/x86-vperm2.ll
index 92cc4afefa7..864296dd189 100644
--- a/test/Transforms/InstCombine/x86-vperm2.ll
+++ b/test/Transforms/InstCombine/x86-vperm2.ll
@@ -76,7 +76,7 @@ define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
   ret <4 x double> %res
 
 ; CHECK-LABEL: @perm2pd_0x02
-; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double> %1
 }
 
@@ -85,7 +85,7 @@ define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
   ret <4 x double> %res
 
 ; CHECK-LABEL: @perm2pd_0x03
-; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double> %1
 }
 
@@ -111,7 +111,7 @@ define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
   ret <4 x double> %res
 
 ; CHECK-LABEL: @perm2pd_0x12
-; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  ret <4 x double> %1
 }
 
@@ -120,7 +120,7 @@ define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
   ret <4 x double> %res
 
 ; CHECK-LABEL: @perm2pd_0x13
-; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
 ; CHECK-NEXT:  ret <4 x double> %1
 }
 
@@ -207,26 +207,41 @@ define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
 }
 
 
-; Confirm that when a single zero mask bit is set, we do nothing.
+; Confirm that when a single zero mask bit is set, we replace a source vector with zeros.
+
+define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 129)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x81
+; CHECK-NEXT:  shufflevector <4 x double> %a0, <4 x double> <double 0.0{{.*}}, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  ret <4 x double>
+}
 
 define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131)
   ret <4 x double> %res
 
 ; CHECK-LABEL: @perm2pd_0x83
-; CHECK-NEXT:  call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 -125)
+; CHECK-NEXT:  shufflevector <4 x double> %a1, <4 x double> <double 0.0{{.*}}, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double>
 }
 
+define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 40)
+  ret <4 x double> %res
 
-; Confirm that when the other zero mask bit is set, we do nothing. Also confirm that an ignored bit has no effect.
+; CHECK-LABEL: @perm2pd_0x28
+; CHECK-NEXT:  shufflevector <4 x double> <double 0.0{{.*}}, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:  ret <4 x double>
+}
 
-define <4 x double> @perm2pd_0x48(<4 x double> %a0, <4 x double> %a1) {
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72)
+define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 8)
   ret <4 x double> %res
 
-; CHECK-LABEL: @perm2pd_0x48
-; CHECK-NEXT:  call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72)
+; CHECK-LABEL: @perm2pd_0x08
+; CHECK-NEXT:  shufflevector <4 x double> <double 0.0{{.*}}, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double>
 }