From: Yi Jiang
Date: Mon, 21 Apr 2014 19:34:27 +0000 (+0000)
Subject: ARM64: Combine shifts and uses from different basic block to bit-extract instruction
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=5d473a08317ce9f150701c1a3cdb2e3df10922fc;p=oota-llvm.git

ARM64: Combine shifts and uses from different basic block to bit-extract instruction

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@206774 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 2e4956f16d0..c17416dfee9 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -182,6 +182,9 @@ public:
     return HasMultipleConditionRegisters;
   }
 
+  /// Return true if the target has BitExtract instructions.
+  bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
+
   /// Return true if a vector of the given type should be split
   /// (TypeSplitVector) instead of promoted (TypePromoteInteger) during type
   /// legalization.
@@ -1010,6 +1013,14 @@ protected:
     HasMultipleConditionRegisters = hasManyRegs;
   }
 
+  /// Tells the code generator that the target has BitExtract instructions.
+  /// The code generator will aggressively sink "shift"s into the blocks of
+  /// their users if the users will generate "and" instructions which can be
+  /// combined with "shift" to BitExtract instructions.
+  void setHasExtractBitsInsn(bool hasExtractInsn = true) {
+    HasExtractBitsInsn = hasExtractInsn;
+  }
+
   /// Tells the code generator not to expand sequence of operations into a
   /// separate sequences that increases the amount of flow control.
   void setJumpIsExpensive(bool isExpensive = true) {
@@ -1436,6 +1447,12 @@ private:
   /// the blocks of their users.
   bool HasMultipleConditionRegisters;
 
+  /// Tells the code generator that the target has BitExtract instructions.
+  /// The code generator will aggressively sink "shift"s into the blocks of
+  /// their users if the users will generate "and" instructions which can be
+  /// combined with "shift" to BitExtract instructions.
+  bool HasExtractBitsInsn;
+
   /// Tells the code generator not to expand integer divides by constants into a
   /// sequence of muls, adds, and shifts. This is a hack until a real cost
   /// model is in place. If we ever optimize for size, this will be set to true
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 009e153e0e8..27fdd5187bd 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -628,6 +628,187 @@ static bool OptimizeCmpExpression(CmpInst *CI) {
   return MadeChange;
 }
 
+/// isExtractBitsCandidateUse - Check if the candidates could
+/// be combined with shift instruction, which includes:
+/// 1. Truncate instruction
+/// 2. And instruction and the imm is a mask of the low bits:
+///    imm & (imm+1) == 0
+bool isExtractBitsCandidateUse(Instruction *User) {
+  if (!isa<TruncInst>(User)) {
+    if (User->getOpcode() != Instruction::And ||
+        !isa<ConstantInt>(User->getOperand(1)))
+      return false;
+
+    unsigned Cimm = dyn_cast<ConstantInt>(User->getOperand(1))->getZExtValue();
+
+    if (Cimm & (Cimm + 1))
+      return false;
+  }
+  return true;
+}
+
+/// SinkShiftAndTruncate - sink both shift and truncate instructions
+/// to the BB of the truncate's user.
+bool
+SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
+                     DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
+                     const TargetLowering &TLI) {
+  BasicBlock *UserBB = User->getParent();
+  DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
+  TruncInst *TruncI = dyn_cast<TruncInst>(User);
+  bool MadeChange = false;
+
+  for (Value::user_iterator TruncUI = TruncI->user_begin(),
+                            TruncE = TruncI->user_end();
+       TruncUI != TruncE;) {
+
+    Use &TruncTheUse = TruncUI.getUse();
+    Instruction *TruncUser = cast<Instruction>(*TruncUI);
+    // Preincrement use iterator so we don't invalidate it.
+
+    ++TruncUI;
+
+    int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
+    if (!ISDOpcode)
+      continue;
+
+    // If the use is actually a legal node, there will not be an implicit
+    // truncate.
+    if (TLI.isOperationLegalOrCustom(ISDOpcode,
+                                     EVT::getEVT(TruncUser->getType())))
+      continue;
+
+    // Don't bother for PHI nodes.
+    if (isa<PHINode>(TruncUser))
+      continue;
+
+    BasicBlock *TruncUserBB = TruncUser->getParent();
+
+    if (UserBB == TruncUserBB)
+      continue;
+
+    BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
+    CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
+
+    if (!InsertedShift && !InsertedTrunc) {
+      BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
+      // Sink the shift
+      if (ShiftI->getOpcode() == Instruction::AShr)
+        InsertedShift =
+            BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt);
+      else
+        InsertedShift =
+            BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt);
+
+      // Sink the trunc
+      BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
+      TruncInsertPt++;
+
+      InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
+                                       TruncI->getType(), "", TruncInsertPt);
+
+      MadeChange = true;
+
+      TruncTheUse = InsertedTrunc;
+    }
+  }
+  return MadeChange;
+}
+
+/// OptimizeExtractBits - sink the shift *right* instruction into user blocks if
+/// the uses could potentially be combined with this shift instruction and
+/// generate a BitExtract instruction. It will only be applied if the
+/// architecture supports BitExtract instructions. Here is an example:
+/// BB1:
+///   %x.extract.shift = lshr i64 %arg1, 32
+/// BB2:
+///   %x.extract.trunc = trunc i64 %x.extract.shift to i16
+/// ==>
+///
+/// BB2:
+///   %x.extract.shift.1 = lshr i64 %arg1, 32
+///   %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
+///
+/// CodeGen will recognize the pattern in BB2 and generate a BitExtract
+/// instruction.
+/// Return true if any changes are made.
+static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
+                                const TargetLowering &TLI) {
+  BasicBlock *DefBB = ShiftI->getParent();
+
+  /// Only insert instructions in each block once.
+  DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
+
+  bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(ShiftI->getType()));
+
+  bool MadeChange = false;
+  for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
+       UI != E;) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    // Don't bother for PHI nodes.
+    if (isa<PHINode>(User))
+      continue;
+
+    if (!isExtractBitsCandidateUse(User))
+      continue;
+
+    BasicBlock *UserBB = User->getParent();
+
+    if (UserBB == DefBB) {
+      // If the shift and truncate instructions are in the same BB, the use of
+      // the truncate (TruncUse) may still introduce another truncate if not
+      // legal. In this case, we would like to sink both shift and truncate
+      // instructions to the BB of TruncUse.
+      // for example:
+      // BB1:
+      // i64 shift.result = lshr i64 opnd, imm
+      // trunc.result = trunc shift.result to i16
+      //
+      // BB2:
+      // ----> We will have an implicit truncate here if the architecture does
+      // not have i16 compare.
+      // cmp i16 trunc.result, opnd2
+      //
+      if (isa<TruncInst>(User) && shiftIsLegal
+          // If the type of the truncate is legal, no truncate will be
+          // introduced in other basic blocks.
+          && (!TLI.isTypeLegal(TLI.getValueType(User->getType()))))
+        MadeChange =
+            SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI);
+
+      continue;
+    }
+    // If we have already inserted a shift into this block, use it.
+    BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
+
+    if (!InsertedShift) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+
+      if (ShiftI->getOpcode() == Instruction::AShr)
+        InsertedShift =
+            BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt);
+      else
+        InsertedShift =
+            BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt);
+
+      MadeChange = true;
+    }
+
+    // Replace a use of the shift with a use of the new shift.
+    TheUse = InsertedShift;
+  }
+
+  // If we removed all uses, nuke the shift.
+  if (ShiftI->use_empty())
+    ShiftI->eraseFromParent();
+
+  return MadeChange;
+}
+
 namespace {
 class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
 protected:
@@ -3225,6 +3406,17 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
     return false;
   }
 
+  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
+
+  if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
+                BinOp->getOpcode() == Instruction::LShr)) {
+    ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+    if (TLI && CI && TLI->hasExtractBitsInsn())
+      return OptimizeExtractBits(BinOp, CI, *TLI);
+
+    return false;
+  }
+
   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
     if (GEPI->hasAllZeroIndices()) {
       /// The GEP operand must be a pointer, so must its result -> BitCast
diff --git a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
index 8ef13e781b9..f96249937ec 100644
--- a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
+++ b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
@@ -1183,6 +1183,14 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
     // Make sure to clamp the MSB so that we preserve the semantics of the
     // original operations.
     ClampMSB = true;
+  } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
+             isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
+                                   Srl_imm)) {
+    // If the shift result was truncated, we can still combine them.
+    Opd0 = Op0->getOperand(0).getOperand(0);
+
+    // Use the type of SRL node.
+    VT = Opd0->getValueType(0);
   } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
     Opd0 = Op0->getOperand(0);
   } else if (BiggerPattern) {
@@ -1277,8 +1285,19 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
 
   // we're looking for a shift of a shift
   uint64_t Shl_imm = 0;
+  uint64_t Trunc_bits = 0;
   if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
     Opd0 = N->getOperand(0).getOperand(0);
+  } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
+             N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
+    // We are looking for a shift of truncate. Truncate from i64 to i32 could
+    // be considered as setting high 32 bits as zero. Our strategy here is to
+    // always generate 64bit UBFM. This consistency will help the CSE pass
+    // later find more redundancy.
+    Opd0 = N->getOperand(0).getOperand(0);
+    Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
+    VT = Opd0->getValueType(0);
+    assert(VT == MVT::i64 && "the promoted type should be i64");
   } else if (BiggerPattern) {
     // Let's pretend a 0 shift left has been performed.
     // FIXME: Currently we limit this to the bigger pattern case,
@@ -1295,7 +1314,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
   assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
          "bad amount in shift node!");
   // Note: The width operand is encoded as width-1.
-  unsigned Width = VT.getSizeInBits() - Srl_imm - 1;
+  unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
   int sLSB = Srl_imm - Shl_imm;
   if (sLSB < 0)
     return false;
@@ -1354,8 +1373,23 @@ SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
     return NULL;
 
   EVT VT = N->getValueType(0);
-  SDValue Ops[] = { Opd0, CurDAG->getTargetConstant(LSB, VT),
-                    CurDAG->getTargetConstant(MSB, VT) };
+
+  // If the bit extract operation is 64bit but the original type is 32bit, we
+  // need to add one EXTRACT_SUBREG.
+  if ((Opc == ARM64::SBFMXri || Opc == ARM64::UBFMXri) && VT == MVT::i32) {
+    SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64),
+                       CurDAG->getTargetConstant(MSB, MVT::i64)};
+
+    SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64);
+    SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
+    MachineSDNode *Node =
+        CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32,
+                               SDValue(BFM, 0), SubReg);
+    return Node;
+  }
+
+  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT),
+                   CurDAG->getTargetConstant(MSB, VT)};
   return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
 }
 
diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp
index 7fee5646d1c..ee498e65c71 100644
--- a/lib/Target/ARM64/ARM64ISelLowering.cpp
+++ b/lib/Target/ARM64/ARM64ISelLowering.cpp
@@ -438,6 +438,8 @@ ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM)
   setDivIsWellDefined(true);
 
   RequireStrictAlign = StrictAlign;
+
+  setHasExtractBitsInsn(true);
 }
 
 void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
diff --git a/test/CodeGen/ARM64/bitfield-extract.ll b/test/CodeGen/ARM64/bitfield-extract.ll
index 40dee710aa9..4a62cfaf7fc 100644
--- a/test/CodeGen/ARM64/bitfield-extract.ll
+++ b/test/CodeGen/ARM64/bitfield-extract.ll
@@ -1,3 +1,4 @@
+; RUN: opt -codegenprepare -mtriple=arm64-apple=ios -S -o - %s | FileCheck --check-prefix=OPT %s
 ; RUN: llc < %s -march=arm64 | FileCheck %s
 %struct.X = type { i8, i8, [2 x i8] }
 %struct.Y = type { i32, i8 }
@@ -404,3 +405,75 @@ define i64 @fct18(i32 %xor72) nounwind ssp {
   %result = and i64 %conv82, 255
   ret i64 %result
 }
+
+; Using the access to the global array to keep the instruction and control flow.
+@first_ones = external global [65536 x i8]
+
+; Function Attrs: nounwind readonly ssp
+define i32 @fct19(i64 %arg1) nounwind readonly ssp {
+; CHECK-LABEL: fct19:
+entry:
+  %x.sroa.1.0.extract.shift = lshr i64 %arg1, 16
+  %x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16
+  %x.sroa.3.0.extract.shift = lshr i64 %arg1, 32
+  %x.sroa.5.0.extract.shift = lshr i64 %arg1, 48
+  %tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %x.sroa.5.0.extract.shift
+  %0 = load i8* %arrayidx3, align 1
+  %conv = zext i8 %0 to i32
+  br label %return
+
+; OPT-LABEL: if.end
+if.end:                                           ; preds = %entry
+; OPT: lshr
+; CHECK: ubfm [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #47
+  %x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16
+  %tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0
+; CHECK: cbz
+  br i1 %tobool6, label %if.end13, label %if.then7
+
+; OPT-LABEL: if.then7
+if.then7:                                         ; preds = %if.end
+; OPT: lshr
+; "and" should be combined to "ubfm" while "ubfm" should be removed by CSE.
+; So neither of them should be in the assembly code.
+; CHECK-NOT: and
+; CHECK-NOT: ubfm
+  %idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535
+  %arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom10
+  %1 = load i8* %arrayidx11, align 1
+  %conv12 = zext i8 %1 to i32
+  %add = add nsw i32 %conv12, 16
+  br label %return
+
+; OPT-LABEL: if.end13
+if.end13:                                         ; preds = %if.end
+; OPT: lshr
+; OPT: trunc
+; CHECK: ubfm [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #31
+  %tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0
+; CHECK: cbz
+  br i1 %tobool16, label %return, label %if.then17
+
+; OPT-LABEL: if.then17
+if.then17:                                        ; preds = %if.end13
+; OPT: lshr
+; "and" should be combined to "ubfm" while "ubfm" should be removed by CSE.
+; So neither of them should be in the assembly code.
+; CHECK-NOT: and
+; CHECK-NOT: ubfm
+  %idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535
+  %arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom20
+  %2 = load i8* %arrayidx21, align 1
+  %conv22 = zext i8 %2 to i32
+  %add23 = add nsw i32 %conv22, 32
+  br label %return
+
+return:                                           ; preds = %if.end13, %if.then17, %if.then7, %if.then
+; CHECK: ret
+  %retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23, %if.then17 ], [ 64, %if.end13 ]
+  ret i32 %retval.0
+}
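
The effect of the CodeGenPrepare change is easiest to see on a small example. The IR below is only an illustrative sketch (the function, block, and value names are invented and do not come from the patch or its tests): once a target reports hasExtractBitsInsn(), a right shift by a constant is duplicated into the block of any user that masks the low bits, so instruction selection sees the lshr and the and together in one block and can fold them into a single bitfield extract (ubfm/ubfx on ARM64).

  ; Input to codegenprepare: the shift and the low-bit mask live in
  ; different basic blocks, so ISel alone could not combine them.
  define i64 @lshr_and_sketch(i64 %arg) {        ; hypothetical function
  entry:
    %shift = lshr i64 %arg, 32
    br label %use

  use:
    %masked = and i64 %shift, 65535              ; 0xffff, a low-bit mask
    ret i64 %masked
  }

  ; After codegenprepare (sketch of the expected output):
  ;   use:
  ;     %shift.sunk = lshr i64 %arg, 32
  ;     %masked = and i64 %shift.sunk, 65535
  ;     ret i64 %masked
  ; With the pair in one block, the ARM64 selector can emit a single
  ; bitfield extract instead of a shift followed by an and.

If every interesting use is rewritten this way, the original shift becomes dead and is erased, which is what the use_empty()/eraseFromParent() step in OptimizeExtractBits does.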
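
The SinkShiftAndTruncate path covers a second pattern: the shift and a truncate start out in the same block, but the truncate's user sits in another block and uses a type that is not legal on the target (for example an i16 compare on ARM64). Another minimal, hypothetical sketch with invented names:

  ; Input: %s and %t are in %entry, the i16 compare is in %bb2.
  define i32 @sink_trunc_sketch(i64 %arg) {      ; hypothetical test case
  entry:
    %s = lshr i64 %arg, 16
    %t = trunc i64 %s to i16
    br label %bb2

  bb2:
    %c = icmp eq i16 %t, 0                       ; i16 is not a legal ARM64 type
    %r = zext i1 %c to i32
    ret i32 %r
  }

  ; After codegenprepare (sketch): both the shift and the trunc are
  ; re-created at the top of %bb2, next to the compare that would otherwise
  ; force an implicit truncate there:
  ;   bb2:
  ;     %s.sunk = lshr i64 %arg, 16
  ;     %t.sunk = trunc i64 %s.sunk to i16
  ;     %c = icmp eq i16 %t.sunk, 0
  ;     ...

Keeping the shift, the truncate, and its user in one block is what lets the new ARM64 ISel code select a 64-bit UBFM (with an EXTRACT_SUBREG when an i32 result is needed) for these patterns, as exercised by the fct19 test above.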