From: Simon Pilgrim
Date: Sat, 12 Sep 2015 13:39:53 +0000 (+0000)
Subject: [InstCombine] CVTPH2PS Vector Demanded Elements + Constant Folding
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=231ed70f86ce60e611ca9f57e2bd2b138fc19d12;p=oota-llvm.git

[InstCombine] CVTPH2PS Vector Demanded Elements + Constant Folding

Improved InstCombine support for CVTPH2PS (the F16C half-to-float conversion):
<4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) only uses the bottom 4 i16
elements of its source for the conversion.

Added constant folding support.

Differential Revision: http://reviews.llvm.org/D12731

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247504 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index cc7d4be7a78..b0d898738a5 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -796,6 +796,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return new StoreInst(II->getArgOperand(0), Ptr);
     }
     break;
+
   case Intrinsic::x86_sse_storeu_ps:
   case Intrinsic::x86_sse2_storeu_pd:
   case Intrinsic::x86_sse2_storeu_dq:
@@ -809,6 +810,52 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     }
     break;
 
+  case Intrinsic::x86_vcvtph2ps_128:
+  case Intrinsic::x86_vcvtph2ps_256: {
+    auto Arg = II->getArgOperand(0);
+    auto ArgType = cast<VectorType>(Arg->getType());
+    auto RetType = cast<VectorType>(II->getType());
+    unsigned ArgWidth = ArgType->getNumElements();
+    unsigned RetWidth = RetType->getNumElements();
+    assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths");
+    assert(ArgType->isIntOrIntVectorTy() &&
+           ArgType->getScalarSizeInBits() == 16 &&
+           "CVTPH2PS input type should be 16-bit integer vector");
+    assert(RetType->getScalarType()->isFloatTy() &&
+           "CVTPH2PS output type should be 32-bit float vector");
+
+    // Constant folding: Convert to generic half to single conversion.
+    if (auto CIZero = dyn_cast<ConstantAggregateZero>(Arg))
+      return ReplaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));
+
+    if (auto CIHalf = dyn_cast<ConstantDataVector>(Arg)) {
+      auto VectorHalfAsShorts = Arg;
+      if (RetWidth < ArgWidth) {
+        SmallVector<int, 8> SubVecMask;
+        for (unsigned i = 0; i != RetWidth; ++i)
+          SubVecMask.push_back((int)i);
+        VectorHalfAsShorts = Builder->CreateShuffleVector(
+            Arg, UndefValue::get(ArgType), SubVecMask);
+      }
+
+      auto VectorHalfType =
+          VectorType::get(Type::getHalfTy(II->getContext()), RetWidth);
+      auto VectorHalfs =
+          Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType);
+      auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType);
+      return ReplaceInstUsesWith(*II, VectorFloats);
+    }
+
+    // We only use the lowest lanes of the argument.
+    APInt DemandedElts = APInt::getLowBitsSet(ArgWidth, RetWidth);
+    APInt UndefElts(ArgWidth, 0);
+    if (Value *V = SimplifyDemandedVectorElts(Arg, DemandedElts, UndefElts)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+    break;
+  }
+
   case Intrinsic::x86_sse_cvtss2si:
   case Intrinsic::x86_sse_cvtss2si64:
   case Intrinsic::x86_sse_cvttss2si:
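A quick aside on the hardware semantics the hunk above relies on. The
standalone C++ snippet below is not part of the patch; it illustrates that
the 128-bit VCVTPH2PS reads only the low four i16 lanes (the low 64 bits)
of its source, which is why the upper lanes can be treated as undemanded.
It uses the Intel intrinsics from <immintrin.h> and assumes an F16C-capable
CPU; build with, e.g., clang++ -mf16c.

#include <immintrin.h>
#include <cstdio>

int main() {
  // Low 4 lanes: the half-precision encodings of 0.0, 0.5, 1.0 and 2.0.
  // High 4 lanes: junk (half NaN patterns) that must not affect the result.
  __m128i Halfs = _mm_setr_epi16(0x0000, 0x3800, 0x3C00, 0x4000,
                                 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF);
  __m128 Floats = _mm_cvtph_ps(Halfs); // VCVTPH2PS xmm, xmm
  float Out[4];
  _mm_storeu_ps(Out, Floats);
  std::printf("%f %f %f %f\n", Out[0], Out[1], Out[2], Out[3]);
  // Prints: 0.000000 0.500000 1.000000 2.000000
  return 0;
}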
diff --git a/test/Transforms/InstCombine/x86-f16c.ll b/test/Transforms/InstCombine/x86-f16c.ll
new file mode 100644
index 00000000000..e10b339907e
--- /dev/null
+++ b/test/Transforms/InstCombine/x86-f16c.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>)
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>)
+
+;
+; Vector Demanded Bits
+;
+
+; Only bottom 4 elements required.
+define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) {
+; CHECK-LABEL: @demand_vcvtph2ps_128
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %A)
+; CHECK-NEXT: ret <4 x float> %1
+  %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1)
+  ret <4 x float> %2
+}
+
+; All 8 elements required.
+define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
+; CHECK-LABEL: @demand_vcvtph2ps_256
+; CHECK-NEXT: %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: %2 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)
+; CHECK-NEXT: ret <8 x float> %2
+  %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)
+  ret <8 x float> %2
+}
+
+;
+; Constant Folding
+;
+
+define <4 x float> @fold_vcvtph2ps_128() {
+; CHECK-LABEL: @fold_vcvtph2ps_128
+; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 2.000000e+00>
+  %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> <i16 0, i16 14336, i16 15360, i16 16384, i16 16896, i16 17408, i16 17664, i16 17920>)
+  ret <4 x float> %1
+}
+
+define <8 x float> @fold_vcvtph2ps_256() {
+; CHECK-LABEL: @fold_vcvtph2ps_256
+; CHECK-NEXT: ret <8 x float> <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00>
+  %1 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> <i16 0, i16 14336, i16 15360, i16 16384, i16 16896, i16 17408, i16 17664, i16 17920>)
+  ret <8 x float> %1
+}
+
+define <4 x float> @fold_vcvtph2ps_128_zero() {
+; CHECK-LABEL: @fold_vcvtph2ps_128_zero
+; CHECK-NEXT: ret <4 x float> zeroinitializer
+  %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  ret <4 x float> %1
+}
+
+define <8 x float> @fold_vcvtph2ps_256_zero() {
+; CHECK-LABEL: @fold_vcvtph2ps_256_zero
+; CHECK-NEXT: ret <8 x float> zeroinitializer
+  %1 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  ret <8 x float> %1
+}
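The half-precision bit patterns in the constant folding tests can be checked
by hand. The C++ sketch below is not part of the patch, and it is not how
InstCombine performs the fold (the hunk above bitcasts the demanded lanes to
<N x half> and emits an fpext); it decodes an IEEE-754 binary16 value
manually so the i16 constants above can be verified. It handles only signed
zeros and normal values, which covers every constant used in these tests;
denormals, infinities and NaNs are ignored for brevity.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Decode a binary16 bit pattern into a float (zeros and normal values only).
static float halfBitsToFloat(uint16_t H) {
  uint32_t Sign = (H >> 15) & 0x1;
  uint32_t Exp = (H >> 10) & 0x1F; // 5-bit exponent, bias 15
  uint32_t Frac = H & 0x3FF;       // 10-bit significand

  uint32_t Bits;
  if (Exp == 0 && Frac == 0)
    Bits = Sign << 31; // +/-0.0
  else
    // Rebias the exponent (15 -> 127) and widen the significand (10 -> 23).
    Bits = (Sign << 31) | ((Exp + (127 - 15)) << 23) | (Frac << 13);

  float F;
  std::memcpy(&F, &Bits, sizeof(F)); // type-pun safely via memcpy
  return F;
}

int main() {
  // The i16 constants from the fold_vcvtph2ps tests.
  const uint16_t Halfs[] = {0, 14336, 15360, 16384, 16896, 17408, 17664, 17920};
  for (uint16_t H : Halfs)
    std::printf("%5u -> %f\n", H, halfBitsToFloat(H));
  // Prints 0.0, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0 and 6.0, matching the CHECK
  // lines in fold_vcvtph2ps_128 and fold_vcvtph2ps_256.
  return 0;
}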