From: Nadav Rotem Date: Mon, 2 Jan 2012 08:05:46 +0000 (+0000) Subject: Optimize the sequence blend(sign_extend(x)) to blend(shl(x)) since SSE blend instruct... X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=a46f35d3d65425af5eaaaf906fca240a33d6c362;p=oota-llvm.git Optimize the sequence blend(sign_extend(x)) to blend(shl(x)) since SSE blend instructions only look at the highest bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@147426 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2d9dbd29c08..baf39c270d0 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -13133,6 +13133,24 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } + // The VSELECT instruction is lowered to SSE blend instructions. In many cases + // the mask is sign-extended to fill the entire lane. However, we only care + // for the highest bit. Convert sign_extend to srl because it is cheaper. 
+ // (vselect(sign_extend(x))) -> vselect(srl(x)) + if (N->getOpcode() == ISD::VSELECT && + Cond.getOpcode() == ISD::SIGN_EXTEND_INREG && Cond.hasOneUse()) { + EVT CondVT = Cond.getValueType(); + EVT SExtTy = cast<VTSDNode>(Cond.getOperand(1))->getVT(); + unsigned BitsDiff = CondVT.getScalarType().getSizeInBits() - + SExtTy.getScalarType().getSizeInBits(); + + EVT ShiftType = EVT::getVectorVT(*DAG.getContext(), + MVT::i32, CondVT.getVectorNumElements()); + SDValue SHL = DAG.getNode(ISD::SHL, DL, CondVT, Cond.getOperand(0), + DAG.getConstant(BitsDiff, ShiftType)); + return DAG.getNode(ISD::VSELECT, DL, VT, SHL, LHS, RHS); + } + return SDValue(); } diff --git a/test/CodeGen/X86/2011-12-28-vselecti8.ll b/test/CodeGen/X86/2011-12-28-vselecti8.ll index dbc122ac6e4..fc1b83b2224 100644 --- a/test/CodeGen/X86/2011-12-28-vselecti8.ll +++ b/test/CodeGen/X86/2011-12-28-vselecti8.ll @@ -5,8 +5,10 @@ target triple = "x86_64-apple-darwin11.2.0" ; CHECK: @foo8 ; CHECK: psll -; CHECK: psraw -; CHECK: pblendvb +; CHECK-NOT: sra +; CHECK: pandn +; CHECK: pand +; CHECK: or ; CHECK: ret define void @foo8(float* nocapture %RET) nounwind { allocas: diff --git a/test/CodeGen/X86/sext-blend.ll b/test/CodeGen/X86/sext-blend.ll new file mode 100644 index 00000000000..b1f9573f307 --- /dev/null +++ b/test/CodeGen/X86/sext-blend.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s + +; CHECK: foo +define <4 x double> @foo(<4 x double> %x, <4 x double> %y) { + ; CHECK: cmpnlepd + ; CHECK: psllq + ; CHECK-NEXT: blendvpd + ; CHECK: psllq + ; CHECK-NEXT: blendvpd + ; CHECK: ret + %min_is_x = fcmp ult <4 x double> %x, %y + %min = select <4 x i1> %min_is_x, <4 x double> %x, <4 x double> %y + ret <4 x double> %min +} + diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll index 2f4317bf294..c6602d3a3b4 100644 --- a/test/CodeGen/X86/sse2-blend.ll +++ b/test/CodeGen/X86/sse2-blend.ll @@ -28,10 +28,10 @@ define 
void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) { ; Without forcing instructions, fall back to the preferred PS domain. ; CHECK: vsel_i64 -; CHECK: xorps -; CHECK: andps -; CHECK: andnps -; CHECK: orps +; CHECK: pxor +; CHECK: and +; CHECK: andn +; CHECK: or ; CHECK: ret define void@vsel_i64(<4 x i64>* %v1, <4 x i64>* %v2) { @@ -44,10 +44,10 @@ define void@vsel_i64(<4 x i64>* %v1, <4 x i64>* %v2) { ; Without forcing instructions, fall back to the preferred PS domain. ; CHECK: vsel_double -; CHECK: xorps -; CHECK: andps -; CHECK: andnps -; CHECK: orps +; CHECK: xor +; CHECK: and +; CHECK: andn +; CHECK: or ; CHECK: ret define void@vsel_double(<4 x double>* %v1, <4 x double>* %v2) { diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll index 78604a0e963..0a71dd0d04d 100644 --- a/test/CodeGen/X86/sse41-blend.ll +++ b/test/CodeGen/X86/sse41-blend.ll @@ -36,6 +36,7 @@ define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { ;CHECK: vsel_double +;CHECK-NOT: sra ;CHECK: blendvpd ;CHECK: ret define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) { @@ -54,6 +55,7 @@ define <4 x i64> @vsel_i64(<4 x i64> %v1, <4 x i64> %v2) { ;CHECK: vsel_i8 +;CHECK-NOT: sra ;CHECK: pblendvb ;CHECK: ret define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { @@ -65,6 +67,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { ; CHECK: A define <2 x double> @A(<2 x double> %x, <2 x double> %y) { ; CHECK: cmplepd + ; CHECK-NOT: sra ; CHECK: blendvpd %max_is_x = fcmp oge <2 x double> %x, %y %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y @@ -74,6 +77,7 @@ define <2 x double> @A(<2 x double> %x, <2 x double> %y) { ; CHECK: B define <2 x double> @B(<2 x double> %x, <2 x double> %y) { ; CHECK: cmpnlepd + ; CHECK-NOT: sra ; CHECK: blendvpd %min_is_x = fcmp ult <2 x double> %x, %y %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y