From: Hal Finkel Date: Fri, 12 Dec 2014 23:59:36 +0000 (+0000) Subject: [PowerPC] Add a DAGToDAG peephole to remove unnecessary zero-exts X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=4e703f82f251f30af75c7e6559398b90e076eebc;ds=sidebyside [PowerPC] Add a DAGToDAG peephole to remove unnecessary zero-exts On PPC64, we end up with lots of i32 -> i64 zero extensions, not only from all of the usual places, but also from the ABI, which specifies that values passed are zero extended. Almost all 32-bit PPC instructions in PPC64 mode are defined to do *something* to the higher-order bits, and for some instructions, that action clears those bits (thus providing a zero-extended result). This is especially common after rotate-and-mask instructions. Adding an additional instruction to zero-extend the results of these instructions is unnecessary. This PPCISelDAGToDAG peephole optimization examines these zero-extensions, and looks back through their operands to see if all instructions will implicitly zero extend their results. If so, we convert these instructions to their 64-bit variants (which is an internal change only, the actual encoding of these instructions is the same as the original 32-bit ones) and remove the unnecessary zero-extension (changing where the INSERT_SUBREG instructions are to make everything internally consistent). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224169 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 2e10fee3a93..cc3cf5fc236 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -205,6 +205,7 @@ private: SDNode *SelectSETCC(SDNode *N); void PeepholePPC64(); + void PeepholePPC64ZExt(); void PeepholeCROps(); bool AllUsersSelectZero(SDNode *N); @@ -1628,6 +1629,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() { PeepholePPC64(); PeepholeCROps(); + PeepholePPC64ZExt(); } // Check if all users of this node will become isel where the second operand @@ -2101,6 +2103,299 @@ void PPCDAGToDAGISel::PeepholeCROps() { } while (IsModified); } +// Gather the set of 32-bit operations that are known to have their +// higher-order 32 bits zero, where ToPromote contains all such operations. +static bool PeepholePPC64ZExtGather(SDValue Op32, + SmallPtrSetImpl &ToPromote) { + if (!Op32.isMachineOpcode()) + return false; + + // First, check for the "frontier" instructions (those that will clear the + // higher-order 32 bits. + + // For RLWINM and RLWNM, we need to make sure that the mask does not wrap + // around. If it does not, then these instructions will clear the + // higher-order bits. + if ((Op32.getMachineOpcode() == PPC::RLWINM || + Op32.getMachineOpcode() == PPC::RLWNM) && + Op32.getConstantOperandVal(2) <= Op32.getConstantOperandVal(3)) { + ToPromote.insert(Op32.getNode()); + return true; + } + + // SLW and SRW always clear the higher-order bits. + if (Op32.getMachineOpcode() == PPC::SLW || + Op32.getMachineOpcode() == PPC::SRW) { + ToPromote.insert(Op32.getNode()); + return true; + } + + // For LI and LIS, we need the immediate to be positive (so that it is not + // sign extended). + if (Op32.getMachineOpcode() == PPC::LI || + Op32.getMachineOpcode() == PPC::LIS) { + if (!isUInt<15>(Op32.getConstantOperandVal(0))) + return false; + + ToPromote.insert(Op32.getNode()); + return true; + } + + // Next, check for those instructions we can look through. + + // Assuming the mask does not wrap around, then the higher-order bits are + // taken directly from the first operand. + if (Op32.getMachineOpcode() == PPC::RLWIMI && + Op32.getConstantOperandVal(3) <= Op32.getConstantOperandVal(4)) { + SmallPtrSet ToPromote1; + if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1)) + return false; + + ToPromote.insert(Op32.getNode()); + ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); + return true; + } + + // For OR, the higher-order bits are zero if that is true for both operands. + // For SELECT_I4, the same is true (but the relevant operand numbers are + // shifted by 1). + if (Op32.getMachineOpcode() == PPC::OR || + Op32.getMachineOpcode() == PPC::SELECT_I4) { + unsigned B = Op32.getMachineOpcode() == PPC::SELECT_I4 ? 1 : 0; + SmallPtrSet ToPromote1; + if (!PeepholePPC64ZExtGather(Op32.getOperand(B+0), ToPromote1)) + return false; + if (!PeepholePPC64ZExtGather(Op32.getOperand(B+1), ToPromote1)) + return false; + + ToPromote.insert(Op32.getNode()); + ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); + return true; + } + + // For ORI and ORIS, we need the higher-order bits of the first operand to be + // zero, and also for the constant to be positive (so that it is not sign + // extended). + if (Op32.getMachineOpcode() == PPC::ORI || + Op32.getMachineOpcode() == PPC::ORIS) { + SmallPtrSet ToPromote1; + if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1)) + return false; + if (!isUInt<15>(Op32.getConstantOperandVal(1))) + return false; + + ToPromote.insert(Op32.getNode()); + ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); + return true; + } + + // The higher-order bits of AND are zero if that is true for at least one of + // the operands. + if (Op32.getMachineOpcode() == PPC::AND) { + SmallPtrSet ToPromote1, ToPromote2; + bool Op0OK = + PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1); + bool Op1OK = + PeepholePPC64ZExtGather(Op32.getOperand(1), ToPromote2); + if (!Op0OK && !Op1OK) + return false; + + ToPromote.insert(Op32.getNode()); + + if (Op0OK) + ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); + + if (Op1OK) + ToPromote.insert(ToPromote2.begin(), ToPromote2.end()); + + return true; + } + + // For ANDI and ANDIS, the higher-order bits are zero if either that is true + // of the first operand, or if the second operand is positive (so that it is + // not sign extended). + if (Op32.getMachineOpcode() == PPC::ANDIo || + Op32.getMachineOpcode() == PPC::ANDISo) { + SmallPtrSet ToPromote1; + bool Op0OK = + PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1); + bool Op1OK = isUInt<15>(Op32.getConstantOperandVal(1)); + if (!Op0OK && !Op1OK) + return false; + + ToPromote.insert(Op32.getNode()); + + if (Op0OK) + ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); + + return true; + } + + return false; +} + +void PPCDAGToDAGISel::PeepholePPC64ZExt() { + if (!PPCSubTarget->isPPC64()) + return; + + // When we zero-extend from i32 to i64, we use a pattern like this: + // def : Pat<(i64 (zext i32:$in)), + // (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32), + // 0, 32)>; + // There are several 32-bit shift/rotate instructions, however, that will + // clear the higher-order bits of their output, rendering the RLDICL + // unnecessary. When that happens, we remove it here, and redefine the + // relevant 32-bit operation to be a 64-bit operation. + + SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); + ++Position; + + bool MadeChange = false; + while (Position != CurDAG->allnodes_begin()) { + SDNode *N = --Position; + // Skip dead nodes and any non-machine opcodes. + if (N->use_empty() || !N->isMachineOpcode()) + continue; + + if (N->getMachineOpcode() != PPC::RLDICL) + continue; + + if (N->getConstantOperandVal(1) != 0 || + N->getConstantOperandVal(2) != 32) + continue; + + SDValue ISR = N->getOperand(0); + if (!ISR.isMachineOpcode() || + ISR.getMachineOpcode() != TargetOpcode::INSERT_SUBREG) + continue; + + if (!ISR.hasOneUse()) + continue; + + if (ISR.getConstantOperandVal(2) != PPC::sub_32) + continue; + + SDValue IDef = ISR.getOperand(0); + if (!IDef.isMachineOpcode() || + IDef.getMachineOpcode() != TargetOpcode::IMPLICIT_DEF) + continue; + + // We now know that we're looking at a canonical i32 -> i64 zext. See if we + // can get rid of it. + + SDValue Op32 = ISR->getOperand(1); + if (!Op32.isMachineOpcode()) + continue; + + // There are some 32-bit instructions that always clear the high-order 32 + // bits, there are also some instructions (like AND) that we can look + // through. + SmallPtrSet ToPromote; + if (!PeepholePPC64ZExtGather(Op32, ToPromote)) + continue; + + // If the ToPromote set contains nodes that have uses outside of the set + // (except for the original INSERT_SUBREG), then abort the transformation. + bool OutsideUse = false; + for (SDNode *PN : ToPromote) { + for (SDNode *UN : PN->uses()) { + if (!ToPromote.count(UN) && UN != ISR.getNode()) { + OutsideUse = true; + break; + } + } + + if (OutsideUse) + break; + } + if (OutsideUse) + continue; + + MadeChange = true; + + // We now know that this zero extension can be removed by promoting to + // nodes in ToPromote to 64-bit operations, where for operations in the + // frontier of the set, we need to insert INSERT_SUBREGs for their + // operands. + for (SDNode *PN : ToPromote) { + unsigned NewOpcode; + switch (PN->getMachineOpcode()) { + default: + llvm_unreachable("Don't know the 64-bit variant of this instruction"); + case PPC::RLWINM: NewOpcode = PPC::RLWINM8; break; + case PPC::RLWNM: NewOpcode = PPC::RLWNM8; break; + case PPC::SLW: NewOpcode = PPC::SLW8; break; + case PPC::SRW: NewOpcode = PPC::SRW8; break; + case PPC::LI: NewOpcode = PPC::LI8; break; + case PPC::LIS: NewOpcode = PPC::LIS8; break; + case PPC::RLWIMI: NewOpcode = PPC::RLWIMI8; break; + case PPC::OR: NewOpcode = PPC::OR8; break; + case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break; + case PPC::ORI: NewOpcode = PPC::ORI8; break; + case PPC::ORIS: NewOpcode = PPC::ORIS8; break; + case PPC::AND: NewOpcode = PPC::AND8; break; + case PPC::ANDIo: NewOpcode = PPC::ANDIo8; break; + case PPC::ANDISo: NewOpcode = PPC::ANDISo8; break; + } + + // Note: During the replacement process, the nodes will be in an + // inconsistent state (some instructions will have operands with values + // of the wrong type). Once done, however, everything should be right + // again. + + SmallVector Ops; + for (const SDValue &V : PN->ops()) { + if (!ToPromote.count(V.getNode()) && V.getValueType() == MVT::i32 && + !isa(V)) { + SDValue ReplOpOps[] = { ISR.getOperand(0), V, ISR.getOperand(2) }; + SDNode *ReplOp = + CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(V), + ISR.getNode()->getVTList(), ReplOpOps); + Ops.push_back(SDValue(ReplOp, 0)); + } else { + Ops.push_back(V); + } + } + + // Because all to-be-promoted nodes only have users that are other + // promoted nodes (or the original INSERT_SUBREG), we can safely replace + // the i32 result value type with i64. + + SmallVector NewVTs; + SDVTList VTs = PN->getVTList(); + for (unsigned i = 0, ie = VTs.NumVTs; i != ie; ++i) + if (VTs.VTs[i] == MVT::i32) + NewVTs.push_back(MVT::i64); + else + NewVTs.push_back(VTs.VTs[i]); + + DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: "); + DEBUG(PN->dump(CurDAG)); + + CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops); + + DEBUG(dbgs() << "\nNew: "); + DEBUG(PN->dump(CurDAG)); + DEBUG(dbgs() << "\n"); + } + + // Now we replace the original zero extend and its associated INSERT_SUBREG + // with the value feeding the INSERT_SUBREG (which has now been promoted to + // return an i64). + + DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: "); + DEBUG(N->dump(CurDAG)); + DEBUG(dbgs() << "\nNew: "); + DEBUG(Op32.getNode()->dump(CurDAG)); + DEBUG(dbgs() << "\n"); + + ReplaceUses(N, Op32.getNode()); + } + + if (MadeChange) + CurDAG->RemoveDeadNodes(); +} + void PPCDAGToDAGISel::PeepholePPC64() { // These optimizations are currently supported only for 64-bit SVR4. if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64()) diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 6293752ad5b..460e49e5724 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -547,6 +547,11 @@ defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS), defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS), "extsh", "$rA, $rS", IIC_IntSimple, [(set i64:$rA, (sext_inreg i64:$rS, i16))]>; + +defm SLW8 : XForm_6r<31, 24, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "slw", "$rA, $rS, $rB", IIC_IntGeneral, []>; +defm SRW8 : XForm_6r<31, 536, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "srw", "$rA, $rS, $rB", IIC_IntGeneral, []>; } // Interpretation64Bit // For fast-isel: @@ -645,7 +650,11 @@ defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA), "rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral, []>; -let isCommutable = 1 in { +defm RLWNM8 : MForm_2r<23, (outs g8rc:$rA), + (ins g8rc:$rS, g8rc:$rB, u5imm:$MB, u5imm:$ME), + "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral, + []>; + // RLWIMI can be commuted if the rotate amount is zero. let Interpretation64Bit = 1, isCodeGenOnly = 1 in defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA), @@ -653,7 +662,6 @@ defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA), u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME", IIC_IntRotate, []>, PPC970_DGroup_Cracked, RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">; -} let isSelect = 1 in def ISEL8 : AForm_4<31, 15, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index daf8790fefc..c743d66ee04 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -230,10 +230,12 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Normal instructions can be commuted the obvious way. if (MI->getOpcode() != PPC::RLWIMI && - MI->getOpcode() != PPC::RLWIMIo && - MI->getOpcode() != PPC::RLWIMI8 && - MI->getOpcode() != PPC::RLWIMI8o) + MI->getOpcode() != PPC::RLWIMIo) return TargetInstrInfo::commuteInstruction(MI, NewMI); + // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a + // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because + // changing the relative order of the mask operands might change what happens + // to the high-bits of the mask (and, thus, the result). // Cannot commute if it has a non-zero rotate count. if (MI->getOperand(3).getImm() != 0) diff --git a/test/CodeGen/PowerPC/rm-zext.ll b/test/CodeGen/PowerPC/rm-zext.ll new file mode 100644 index 00000000000..1e60d1198bc --- /dev/null +++ b/test/CodeGen/PowerPC/rm-zext.ll @@ -0,0 +1,32 @@ +; RUN: llc -mcpu=pwr7 < %s | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; Function Attrs: nounwind readnone +define signext i32 @foo(i32 signext %a) #0 { +entry: + %mul = mul nsw i32 %a, %a + %shr2 = lshr i32 %mul, 5 + ret i32 %shr2 + +; CHECK-LABEL @foo +; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32 +; CHECK: blr +} + +define zeroext i32 @test6(i32 zeroext %x) #0 { +entry: + %and = lshr i32 %x, 16 + %shr = and i32 %and, 255 + %and1 = shl i32 %x, 16 + %shl = and i32 %and1, 16711680 + %or = or i32 %shr, %shl + ret i32 %or + +; CHECK-LABEL @test6 +; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32 +; CHECK: blr +} + +attributes #0 = { nounwind readnone } +