From: Simon Pilgrim
Date: Sat, 12 Sep 2015 13:39:53 +0000 (+0000)
Subject: [InstCombine] CVTPH2PS Vector Demanded Elements + Constant Folding
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=231ed70f86ce60e611ca9f57e2bd2b138fc19d12;p=oota-llvm.git

[InstCombine] CVTPH2PS Vector Demanded Elements + Constant Folding

Improved InstCombine support for CVTPH2PS (the F16C half-to-float conversion):
<4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) only uses the bottom 4 i16
elements of its source for the conversion.

Added constant folding support.

Differential Revision: http://reviews.llvm.org/D12731

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247504 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index cc7d4be7a78..b0d898738a5 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -796,6 +796,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return new StoreInst(II->getArgOperand(0), Ptr);
     }
     break;
+
   case Intrinsic::x86_sse_storeu_ps:
   case Intrinsic::x86_sse2_storeu_pd:
   case Intrinsic::x86_sse2_storeu_dq:
@@ -809,6 +810,52 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     }
     break;
 
+  case Intrinsic::x86_vcvtph2ps_128:
+  case Intrinsic::x86_vcvtph2ps_256: {
+    auto Arg = II->getArgOperand(0);
+    auto ArgType = cast<VectorType>(Arg->getType());
+    auto RetType = cast<VectorType>(II->getType());
+    unsigned ArgWidth = ArgType->getNumElements();
+    unsigned RetWidth = RetType->getNumElements();
+    assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths");
+    assert(ArgType->isIntOrIntVectorTy() &&
+           ArgType->getScalarSizeInBits() == 16 &&
+           "CVTPH2PS input type should be 16-bit integer vector");
+    assert(RetType->getScalarType()->isFloatTy() &&
+           "CVTPH2PS output type should be 32-bit float vector");
+
+    // Constant folding: Convert to generic half to single conversion.
+    if (auto CIZero = dyn_cast<ConstantAggregateZero>(Arg))
+      return ReplaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));
+
+    if (auto CIHalf = dyn_cast<ConstantDataVector>(Arg)) {
+      auto VectorHalfAsShorts = Arg;
+      if (RetWidth < ArgWidth) {
+        SmallVector<int, 8> SubVecMask;
+        for (unsigned i = 0; i != RetWidth; ++i)
+          SubVecMask.push_back((int)i);
+        VectorHalfAsShorts = Builder->CreateShuffleVector(
+            Arg, UndefValue::get(ArgType), SubVecMask);
+      }
+
+      auto VectorHalfType =
+          VectorType::get(Type::getHalfTy(II->getContext()), RetWidth);
+      auto VectorHalfs =
+          Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType);
+      auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType);
+      return ReplaceInstUsesWith(*II, VectorFloats);
+    }
+
+    // We only use the lowest lanes of the argument.
+    APInt DemandedElts = APInt::getLowBitsSet(ArgWidth, RetWidth);
+    APInt UndefElts(ArgWidth, 0);
+    if (Value *V = SimplifyDemandedVectorElts(Arg, DemandedElts, UndefElts)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+    break;
+  }
+
   case Intrinsic::x86_sse_cvtss2si:
   case Intrinsic::x86_sse_cvtss2si64:
   case Intrinsic::x86_sse_cvttss2si:
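A quick aside on the hardware semantics the hunk above relies on. The
standalone C++ snippet below is not part of the patch; it illustrates that
the 128-bit VCVTPH2PS reads only the low four i16 lanes (the low 64 bits)
of its source, which is why the upper lanes can be treated as undemanded.
It uses the Intel intrinsics from <immintrin.h> and assumes an F16C-capable
CPU; build with, e.g., clang++ -mf16c.

#include <immintrin.h>
#include <cstdio>

int main() {
  // Low 4 lanes: the half-precision encodings of 0.0, 0.5, 1.0 and 2.0.
  // High 4 lanes: junk (half NaN patterns) that must not affect the result.
  __m128i Halfs = _mm_setr_epi16(0x0000, 0x3800, 0x3C00, 0x4000,
                                 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF);
  __m128 Floats = _mm_cvtph_ps(Halfs); // VCVTPH2PS xmm, xmm
  float Out[4];
  _mm_storeu_ps(Out, Floats);
  std::printf("%f %f %f %f\n", Out[0], Out[1], Out[2], Out[3]);
  // Prints: 0.000000 0.500000 1.000000 2.000000
  return 0;
}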
diff --git a/test/Transforms/InstCombine/x86-f16c.ll b/test/Transforms/InstCombine/x86-f16c.ll
new file mode 100644
index 00000000000..e10b339907e
--- /dev/null
+++ b/test/Transforms/InstCombine/x86-f16c.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>)
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>)
+
+;
+; Vector Demanded Bits
+;
+
+; Only bottom 4 elements required.
+define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) {
+; CHECK-LABEL: @demand_vcvtph2ps_128
+; CHECK-NEXT: %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %A)
+; CHECK-NEXT: ret <4 x float> %1
+  %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1)
+  ret <4 x float> %2
+}
+
+; All 8 elements required.
+define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
+; CHECK-LABEL: @demand_vcvtph2ps_256
+; CHECK-NEXT: %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: %2 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)
+; CHECK-NEXT: ret <8 x float> %2
+  %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)
+  ret <8 x float> %2
+}
+
+;
+; Constant Folding
+;
+
+define <4 x float> @fold_vcvtph2ps_128() {
+; CHECK-LABEL: @fold_vcvtph2ps_128
+; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 2.000000e+00>
+  %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> <i16 0, i16 14336, i16 15360, i16 16384, i16 16896, i16 17408, i16 17664, i16 17920>)
+  ret <4 x float> %1
+}
+
+define <8 x float> @fold_vcvtph2ps_256() {
+; CHECK-LABEL: @fold_vcvtph2ps_256
+; CHECK-NEXT: ret <8 x float> <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00>
+  %1 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> <i16 0, i16 14336, i16 15360, i16 16384, i16 16896, i16 17408, i16 17664, i16 17920>)
+  ret <8 x float> %1
+}
+
+define <4 x float> @fold_vcvtph2ps_128_zero() {
+; CHECK-LABEL: @fold_vcvtph2ps_128_zero
+; CHECK-NEXT: ret <4 x float> zeroinitializer
+  %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  ret <4 x float> %1
+}
+
+define <8 x float> @fold_vcvtph2ps_256_zero() {
+; CHECK-LABEL: @fold_vcvtph2ps_256_zero
+; CHECK-NEXT: ret <8 x float> zeroinitializer
+  %1 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  ret <8 x float> %1
+}
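The half-precision bit patterns in the constant folding tests can be checked
by hand. The C++ sketch below is not part of the patch, and it is not how
InstCombine performs the fold (the hunk above bitcasts the demanded lanes to
<N x half> and emits an fpext); it decodes an IEEE-754 binary16 value
manually so the i16 constants above can be verified. It handles only signed
zeros and normal values, which covers every constant used in these tests;
denormals, infinities and NaNs are ignored for brevity.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Decode a binary16 bit pattern into a float (zeros and normal values only).
static float halfBitsToFloat(uint16_t H) {
  uint32_t Sign = (H >> 15) & 0x1;
  uint32_t Exp = (H >> 10) & 0x1F; // 5-bit exponent, bias 15
  uint32_t Frac = H & 0x3FF;       // 10-bit significand

  uint32_t Bits;
  if (Exp == 0 && Frac == 0)
    Bits = Sign << 31; // +/-0.0
  else
    // Rebias the exponent (15 -> 127) and widen the significand (10 -> 23).
    Bits = (Sign << 31) | ((Exp + (127 - 15)) << 23) | (Frac << 13);

  float F;
  std::memcpy(&F, &Bits, sizeof(F)); // type-pun safely via memcpy
  return F;
}

int main() {
  // The i16 constants from the fold_vcvtph2ps tests.
  const uint16_t Halfs[] = {0, 14336, 15360, 16384, 16896, 17408, 17664, 17920};
  for (uint16_t H : Halfs)
    std::printf("%5u -> %f\n", H, halfBitsToFloat(H));
  // Prints 0.0, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0 and 6.0, matching the CHECK
  // lines in fold_vcvtph2ps_128 and fold_vcvtph2ps_256.
  return 0;
}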