[InstCombine] Improve Vector Demanded Bits Through Bitcasts

author Simon Pilgrim <llvm-dev@redking.me.uk>

Tue, 29 Sep 2015 08:19:11 +0000 (08:19 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Tue, 29 Sep 2015 08:19:11 +0000 (08:19 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 29 Sep 2015 08:19:11 +0000 (08:19 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Tue, 29 Sep 2015 08:19:11 +0000 (08:19 +0000)
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp

index c68a03157847e89a8602623c60d7d9bace206dd9..a1fbda7f7e6d344324b1b4570740300acbd85814 100644 (file)
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1082,6 +1082,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
      if (!VTy) break;
      unsigned InVWidth = VTy->getNumElements();
      APInt InputDemandedElts(InVWidth, 0);
+    UndefElts2 = APInt(InVWidth, 0);
      unsigned Ratio;
  
      if (VWidth == InVWidth) {
@@ -1089,29 +1090,25 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
        // elements as are demanded of us.
        Ratio = 1;
        InputDemandedElts = DemandedElts;
-    } else if (VWidth > InVWidth) {
-      // Untested so far.
-      break;
-
-      // If there are more elements in the result than there are in the source,
-      // then an input element is live if any of the corresponding output
-      // elements are live.
-      Ratio = VWidth/InVWidth;
-      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
+    } else if ((VWidth % InVWidth) == 0) {
+      // If the number of elements in the output is a multiple of the number of
+      // elements in the input then an input element is live if any of the
+      // corresponding output elements are live.
+      Ratio = VWidth / InVWidth;
+      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
          if (DemandedElts[OutIdx])
-          InputDemandedElts.setBit(OutIdx/Ratio);
-      }
-    } else {
-      // Untested so far.
-      break;
-
-      // If there are more elements in the source than there are in the result,
-      // then an input element is live if the corresponding output element is
-      // live.
-      Ratio = InVWidth/VWidth;
+          InputDemandedElts.setBit(OutIdx / Ratio);
+    } else if ((InVWidth % VWidth) == 0) {
+      // If the number of elements in the input is a multiple of the number of
+      // elements in the output then an input element is live if the
+      // corresponding output element is live.
+      Ratio = InVWidth / VWidth;
        for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
-        if (DemandedElts[InIdx/Ratio])
+        if (DemandedElts[InIdx / Ratio])
            InputDemandedElts.setBit(InIdx);
+    } else {
+      // Unsupported so far.
+      break;
      }
  
      // div/rem demand all inputs, because they don't want divide by zero.
@@ -1122,24 +1119,26 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
        MadeChange = true;
      }
  
-    UndefElts = UndefElts2;
-    if (VWidth > InVWidth) {
-      llvm_unreachable("Unimp");
-      // If there are more elements in the result than there are in the source,
-      // then an output element is undef if the corresponding input element is
-      // undef.
+    if (VWidth == InVWidth) {
+      UndefElts = UndefElts2;
+    } else if ((VWidth % InVWidth) == 0) {
+      // If the number of elements in the output is a multiple of the number of
+      // elements in the input then an output element is undef if the
+      // corresponding input element is undef.
        for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
-        if (UndefElts2[OutIdx/Ratio])
+        if (UndefElts2[OutIdx / Ratio])
+          UndefElts.setBit(OutIdx);
+    } else if ((InVWidth % VWidth) == 0) {
+      // If the number of elements in the input is a multiple of the number of
+      // elements in the output then an output element is undef if all of the
+      // corresponding input elements are undef.
+      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
+        APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio);
+        if (SubUndef.countPopulation() == Ratio)
            UndefElts.setBit(OutIdx);
-    } else if (VWidth < InVWidth) {
+      }
+    } else {
        llvm_unreachable("Unimp");
-      // If there are more elements in the source than there are in the result,
-      // then a result element is undef if all of the corresponding input
-      // elements are undef.
-      UndefElts = ~0ULL >> (64-VWidth);  // Start out all undef.
-      for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
-        if (!UndefElts2[InIdx])            // Not undef?
-          UndefElts.clearBit(InIdx/Ratio);    // Clear undef bit.
      }
      break;
    }
diff --git a/test/Transforms/InstCombine/x86-vector-shifts.ll b/test/Transforms/InstCombine/x86-vector-shifts.ll

index d07d90de049cd564eb45ab736e1aef21de843dfc..59e445a40bef0e527ddd3ee237b580c166ea4999 100644 (file)
--- a/test/Transforms/InstCombine/x86-vector-shifts.ll
+++ b/test/Transforms/InstCombine/x86-vector-shifts.ll
@@ -838,6 +838,17 @@ define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) {
    ret <8 x i16> %2
  }
  
+define <8 x i16> @sse2_psra_w_var_bc(<8 x i16> %v, <2 x i64> %a) {
+; CHECK-LABEL: @sse2_psra_w_var_bc
+; CHECK-NEXT: %1 = bitcast <2 x i64> %a to <8 x i16>
+; CHECK-NEXT: %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1)
+; CHECK-NEXT: ret <8 x i16> %2
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = bitcast <2 x i64> %1 to <8 x i16>
+  %3 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %2)
+  ret <8 x i16> %3
+}
+
  define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) {
  ; CHECK-LABEL: @sse2_psra_d_var
  ; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a)
@@ -847,6 +858,17 @@ define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) {
    ret <4 x i32> %2
  }
  
+define <4 x i32> @sse2_psra_d_var_bc(<4 x i32> %v, <8 x i16> %a) {
+; CHECK-LABEL: @sse2_psra_d_var_bc
+; CHECK-NEXT: %1 = bitcast <8 x i16> %a to <4 x i32>
+; CHECK-NEXT: %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1)
+; CHECK-NEXT: ret <4 x i32> %2
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = bitcast <8 x i16> %1 to <4 x i32>
+  %3 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %2)
+  ret <4 x i32> %3
+}
+
  define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) {
  ; CHECK-LABEL: @avx2_psra_w_var
  ; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a)
@@ -901,6 +923,17 @@ define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) {
    ret <16 x i16> %2
  }
  
+define <16 x i16> @avx2_psrl_w_var_bc(<16 x i16> %v, <16 x i8> %a) {
+; CHECK-LABEL: @avx2_psrl_w_var_bc
+; CHECK-NEXT: %1 = bitcast <16 x i8> %a to <8 x i16>
+; CHECK-NEXT: %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1)
+; CHECK-NEXT: ret <16 x i16> %2
+  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = bitcast <16 x i8> %1 to <8 x i16>
+  %3 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %2)
+  ret <16 x i16> %3
+}
+
  define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) {
  ; CHECK-LABEL: @avx2_psrl_d_var
  ; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a)
@@ -910,6 +943,17 @@ define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) {
    ret <8 x i32> %2
  }
  
+define <8 x i32> @avx2_psrl_d_var_bc(<8 x i32> %v, <2 x i64> %a) {
+; CHECK-LABEL: @avx2_psrl_d_var_bc
+; CHECK-NEXT: %1 = bitcast <2 x i64> %a to <4 x i32>
+; CHECK-NEXT: %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1)
+; CHECK-NEXT: ret <8 x i32> %2
+  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+  %2 = bitcast <2 x i64> %1 to <4 x i32>
+  %3 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %2)
+  ret <8 x i32> %3
+}
+
  define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) {
  ; CHECK-LABEL: @avx2_psrl_q_var
  ; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Tue, 29 Sep 2015 08:19:11 +0000 (08:19 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Tue, 29 Sep 2015 08:19:11 +0000 (08:19 +0000)
lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp		patch | blob | history
test/Transforms/InstCombine/x86-vector-shifts.ll		patch | blob | history