From 62dfc511524b28a9411f62e19d48120066c1e41b Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Thu, 5 Jan 2012 11:05:55 +0000 Subject: [PATCH] Prevent a DAGCombine from firing where there are two uses of a combined-away node and the result of the combine isn't substantially smaller than the input, it's just canonicalized. This is the first part of a significant (7%) performance gain for Snappy's hot decompression loop. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@147604 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +++- test/CodeGen/X86/shift-folding.ll | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8a8f0cf2bc7..5b2d86f511d 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3331,7 +3331,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or // (and (srl x, (sub c1, c2), MASK) - if (N1C && N0.getOpcode() == ISD::SRL && + // Only fold this if the inner shift has no other uses -- if it does, folding + // this will increase the total number of instructions. + if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() && N0.getOperand(1).getOpcode() == ISD::Constant) { uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue(); if (c1 < VT.getSizeInBits()) { diff --git a/test/CodeGen/X86/shift-folding.ll b/test/CodeGen/X86/shift-folding.ll index 7eac116e020..3ea601147bb 100644 --- a/test/CodeGen/X86/shift-folding.ll +++ b/test/CodeGen/X86/shift-folding.ll @@ -48,3 +48,23 @@ entry: %tmp512 = lshr i32 %tmp4, 24 ret i32 %tmp512 } + +define i64 @test5(i16 %i, i32* %arr) { +; Ensure that we don't fold away shifts which have multiple uses, as they are +; just re-introduced for the second use. 
+; CHECK: test5: +; CHECK-NOT: shrl +; CHECK: shrl $11 +; CHECK-NOT: shrl +; CHECK: ret + +entry: + %i.zext = zext i16 %i to i32 + %index = lshr i32 %i.zext, 11 + %index.zext = zext i32 %index to i64 + %val.ptr = getelementptr inbounds i32* %arr, i64 %index.zext + %val = load i32* %val.ptr + %val.zext = zext i32 %val to i64 + %sum = add i64 %val.zext, %index.zext + ret i64 %sum +} -- 2.34.1