X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FAArch64%2FAArch64FastISel.cpp;h=1e4499b8d53ae9031f53ce58e44fb062c5946992;hp=4f5de90afcf5d23151eda4233bf44ecf03f9b7ba;hb=d3a04223e84797a1432f10d7a153da6c258017a9;hpb=c9bc145e31764d5a6399642a8b8993882c133ed9 diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 4f5de90afcf..1e4499b8d53 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -78,11 +78,9 @@ class AArch64FastISel final : public FastISel { return Base.Reg; } void setOffsetReg(unsigned Reg) { - assert(isRegBase() && "Invalid offset register access!"); OffsetReg = Reg; } unsigned getOffsetReg() const { - assert(isRegBase() && "Invalid offset register access!"); return OffsetReg; } void setFI(unsigned FI) { @@ -133,6 +131,8 @@ private: bool selectShift(const Instruction *I); bool selectBitCast(const Instruction *I); bool selectFRem(const Instruction *I); + bool selectSDiv(const Instruction *I); + bool selectGetElementPtr(const Instruction *I); // Utility helper routines. bool isTypeLegal(Type *Ty, MVT &VT); @@ -149,6 +149,7 @@ private: unsigned Alignment); bool foldXALUIntrinsic(AArch64CC::CondCode &CC, const Instruction *I, const Value *Cond); + bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT); // Emit helper routines. unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, @@ -172,12 +173,13 @@ private: bool WantResult = true); // Emit functions. + bool emitCompareAndBranch(const BranchInst *BI); bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt); bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt); bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS); - bool emitLoad(MVT VT, unsigned &ResultReg, Address Addr, - MachineMemOperand *MMO = nullptr); + unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true, + MachineMemOperand *MMO = nullptr); bool emitStore(MVT VT, unsigned SrcReg, Address Addr, MachineMemOperand *MMO = nullptr); unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); @@ -185,6 +187,7 @@ private: unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags = false, bool WantResult = true, bool IsZExt = false); + unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm); unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags = false, bool WantResult = true, bool IsZExt = false); @@ -254,6 +257,45 @@ public: #include "AArch64GenCallingConv.inc" +/// \brief Check if the sign-/zero-extend will be a noop. +static bool isIntExtFree(const Instruction *I) { + assert((isa(I) || isa(I)) && + "Unexpected integer extend instruction."); + assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() && + "Unexpected value type."); + bool IsZExt = isa(I); + + if (const auto *LI = dyn_cast(I->getOperand(0))) + if (LI->hasOneUse()) + return true; + + if (const auto *Arg = dyn_cast(I->getOperand(0))) + if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) + return true; + + return false; +} + +/// \brief Determine the implicit scale factor that is applied by a memory +/// operation for a given value type. +static unsigned getImplicitScaleFactor(MVT VT) { + switch (VT.SimpleTy) { + default: + return 0; // invalid + case MVT::i1: // fall-through + case MVT::i8: + return 1; + case MVT::i16: + return 2; + case MVT::i32: // fall-through + case MVT::f32: + return 4; + case MVT::i64: // fall-through + case MVT::f64: + return 8; + } +} + CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { if (CC == CallingConv::WebKit_JS) return CC_AArch64_WebKit_JS; @@ -425,6 +467,19 @@ unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) { return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true); } +/// \brief Check if the multiply is by a power-of-2 constant. +static bool isMulPowOf2(const Value *I) { + if (const auto *MI = dyn_cast(I)) { + if (const auto *C = dyn_cast(MI->getOperand(0))) + if (C->getValue().isPowerOf2()) + return true; + if (const auto *C = dyn_cast(MI->getOperand(1))) + if (C->getValue().isPowerOf2()) + return true; + } + return false; +} + // Computes the address to get to an object. bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) { @@ -536,7 +591,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) std::swap(LHS, RHS); if (const ConstantInt *CI = dyn_cast(RHS)) { - Addr.setOffset(Addr.getOffset() + (uint64_t)CI->getSExtValue()); + Addr.setOffset(Addr.getOffset() + CI->getSExtValue()); return computeAddress(LHS, Addr, Ty); } @@ -547,66 +602,229 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) break; } - case Instruction::Shl: + case Instruction::Sub: { + // Subs of constants are common and easy enough. + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + if (const ConstantInt *CI = dyn_cast(RHS)) { + Addr.setOffset(Addr.getOffset() - CI->getSExtValue()); + return computeAddress(LHS, Addr, Ty); + } + break; + } + case Instruction::Shl: { if (Addr.getOffsetReg()) break; - if (const auto *CI = dyn_cast(U->getOperand(1))) { - unsigned Val = CI->getZExtValue(); - if (Val < 1 || Val > 3) - break; + const auto *CI = dyn_cast(U->getOperand(1)); + if (!CI) + break; - uint64_t NumBytes = 0; - if (Ty && Ty->isSized()) { - uint64_t NumBits = DL.getTypeSizeInBits(Ty); - NumBytes = NumBits / 8; - if (!isPowerOf2_64(NumBits)) - NumBytes = 0; - } + unsigned Val = CI->getZExtValue(); + if (Val < 1 || Val > 3) + break; - if (NumBytes != (1ULL << Val)) - break; + uint64_t NumBytes = 0; + if (Ty && Ty->isSized()) { + uint64_t NumBits = DL.getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (NumBytes != (1ULL << Val)) + break; - Addr.setShift(Val); - Addr.setExtendType(AArch64_AM::LSL); + Addr.setShift(Val); + Addr.setExtendType(AArch64_AM::LSL); - if (const auto *I = dyn_cast(U->getOperand(0))) - if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) - U = I; + const Value *Src = U->getOperand(0); + if (const auto *I = dyn_cast(Src)) + if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) + Src = I; - if (const auto *ZE = dyn_cast(U)) - if (ZE->getOperand(0)->getType()->isIntegerTy(32)) + // Fold the zext or sext when it won't become a noop. + if (const auto *ZE = dyn_cast(Src)) { + if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast(Src)) { + if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } + } - if (const auto *SE = dyn_cast(U)) - if (SE->getOperand(0)->getType()->isIntegerTy(32)) - Addr.setExtendType(AArch64_AM::SXTW); + if (const auto *AI = dyn_cast(Src)) + if (AI->getOpcode() == Instruction::And) { + const Value *LHS = AI->getOperand(0); + const Value *RHS = AI->getOperand(1); + + if (const auto *C = dyn_cast(LHS)) + if (C->getValue() == 0xffffffff) + std::swap(LHS, RHS); + + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 0xffffffff) { + Addr.setExtendType(AArch64_AM::UXTW); + unsigned Reg = getRegForValue(LHS); + if (!Reg) + return false; + bool RegIsKill = hasTrivialKill(LHS); + Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, + AArch64::sub_32); + Addr.setOffsetReg(Reg); + return true; + } + } - unsigned Reg = getRegForValue(U->getOperand(0)); - if (!Reg) - return false; - Addr.setOffsetReg(Reg); - return true; + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + case Instruction::Mul: { + if (Addr.getOffsetReg()) + break; + + if (!isMulPowOf2(U)) + break; + + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + // Canonicalize power-of-2 value to the RHS. + if (const auto *C = dyn_cast(LHS)) + if (C->getValue().isPowerOf2()) + std::swap(LHS, RHS); + + assert(isa(RHS) && "Expected an ConstantInt."); + const auto *C = cast(RHS); + unsigned Val = C->getValue().logBase2(); + if (Val < 1 || Val > 3) + break; + + uint64_t NumBytes = 0; + if (Ty && Ty->isSized()) { + uint64_t NumBits = DL.getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; } + + if (NumBytes != (1ULL << Val)) + break; + + Addr.setShift(Val); + Addr.setExtendType(AArch64_AM::LSL); + + const Value *Src = LHS; + if (const auto *I = dyn_cast(Src)) + if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) + Src = I; + + + // Fold the zext or sext when it won't become a noop. + if (const auto *ZE = dyn_cast(Src)) { + if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast(Src)) { + if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } + } + + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + case Instruction::And: { + if (Addr.getOffsetReg()) + break; + + if (DL.getTypeSizeInBits(Ty) != 8) + break; + + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + if (const auto *C = dyn_cast(LHS)) + if (C->getValue() == 0xffffffff) + std::swap(LHS, RHS); + + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 0xffffffff) { + Addr.setShift(0); + Addr.setExtendType(AArch64_AM::LSL); + Addr.setExtendType(AArch64_AM::UXTW); + + unsigned Reg = getRegForValue(LHS); + if (!Reg) + return false; + bool RegIsKill = hasTrivialKill(LHS); + Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, + AArch64::sub_32); + Addr.setOffsetReg(Reg); + return true; + } break; } + case Instruction::SExt: + case Instruction::ZExt: { + if (!Addr.getReg() || Addr.getOffsetReg()) + break; - if (Addr.getReg()) { - if (!Addr.getOffsetReg()) { - unsigned Reg = getRegForValue(Obj); - if (!Reg) - return false; - Addr.setOffsetReg(Reg); - return true; + const Value *Src = nullptr; + // Fold the zext or sext when it won't become a noop. + if (const auto *ZE = dyn_cast(U)) { + if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast(U)) { + if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } } - return false; + + if (!Src) + break; + + Addr.setShift(0); + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; } + } // end switch - unsigned Reg = getRegForValue(Obj); - if (!Reg) - return false; - Addr.setReg(Reg); - return true; + if (Addr.isRegBase() && !Addr.getReg()) { + unsigned Reg = getRegForValue(Obj); + if (!Reg) + return false; + Addr.setReg(Reg); + return true; + } + + if (!Addr.getOffsetReg()) { + unsigned Reg = getRegForValue(Obj); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + + return false; } bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { @@ -707,17 +925,9 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const { } bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { - unsigned ScaleFactor; - switch (VT.SimpleTy) { - default: return false; - case MVT::i1: // fall-through - case MVT::i8: ScaleFactor = 1; break; - case MVT::i16: ScaleFactor = 2; break; - case MVT::i32: // fall-through - case MVT::f32: ScaleFactor = 4; break; - case MVT::i64: // fall-through - case MVT::f64: ScaleFactor = 8; break; - } + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + return false; bool ImmediateOffsetNeedsLowering = false; bool RegisterOffsetNeedsLowering = false; @@ -731,8 +941,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // Cannot encode an offset register and an immediate offset in the same // instruction. Fold the immediate offset into the load/store instruction and // emit an additonal add to take care of the offset register. - if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.isRegBase() && - Addr.getOffsetReg()) + if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg()) RegisterOffsetNeedsLowering = true; // Cannot encode zero register as base. @@ -742,7 +951,8 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // If this is a stack pointer and the offset needs to be simplified then put // the alloca address into a register, set the base type back to register and // continue. This should almost never happen. - if (ImmediateOffsetNeedsLowering && Addr.isFIBase()) { + if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase()) + { unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) @@ -792,10 +1002,10 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // Since the offset is too large for the load/store instruction get the // reg+offset into a register. if (ImmediateOffsetNeedsLowering) { - unsigned ResultReg = 0; + unsigned ResultReg; if (Addr.getReg()) - ResultReg = fastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), - /*IsKill=*/false, Offset, MVT::i64); + // Try to fold the immediate into the add instruction. + ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset); else ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset); @@ -839,10 +1049,8 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr, MIB.addReg(Addr.getOffsetReg()); MIB.addImm(IsSigned); MIB.addImm(Addr.getShift() != 0); - } else { - MIB.addReg(Addr.getReg()); - MIB.addImm(Offset); - } + } else + MIB.addReg(Addr.getReg()).addImm(Offset); } if (MMO) @@ -879,8 +1087,13 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (UseAdd && isa(LHS) && !isa(RHS)) std::swap(LHS, RHS); + // Canonicalize mul by power of 2 to the RHS. + if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS)) + if (isMulPowOf2(LHS)) + std::swap(LHS, RHS); + // Canonicalize shift immediate to the RHS. - if (UseAdd && isValueAvailable(LHS)) + if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS)) if (const auto *SI = dyn_cast(LHS)) if (isa(SI->getOperand(1))) if (SI->getOpcode() == Instruction::Shl || @@ -910,7 +1123,8 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, return ResultReg; // Only extend the RHS within the instruction if there is a valid extend type. - if (ExtendType != AArch64_AM::InvalidShiftExtend && isValueAvailable(RHS)) { + if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() && + isValueAvailable(RHS)) { if (const auto *SI = dyn_cast(RHS)) if (const auto *C = dyn_cast(SI->getOperand(1))) if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) { @@ -930,8 +1144,28 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, ExtendType, 0, SetFlags, WantResult); } + // Check if the mul can be folded into the instruction. + if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (isMulPowOf2(RHS)) { + const Value *MulLHS = cast(RHS)->getOperand(0); + const Value *MulRHS = cast(RHS)->getOperand(1); + + if (const auto *C = dyn_cast(MulLHS)) + if (C->getValue().isPowerOf2()) + std::swap(MulLHS, MulRHS); + + assert(isa(MulRHS) && "Expected a ConstantInt."); + uint64_t ShiftVal = cast(MulRHS)->getValue().logBase2(); + unsigned RHSReg = getRegForValue(MulLHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(MulLHS); + return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, + AArch64_AM::LSL, ShiftVal, SetFlags, WantResult); + } + // Check if the shift can be folded into the instruction. - if (isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) if (const auto *SI = dyn_cast(RHS)) { if (const auto *C = dyn_cast(SI->getOperand(1))) { AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend; @@ -1197,6 +1431,30 @@ unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, IsZExt); } +/// \brief This method is a wrapper to simplify add emission. +/// +/// First try to emit an add with an immediate operand using emitAddSub_ri. If +/// that fails, then try to materialize the immediate into a register and use +/// emitAddSub_rr instead. +unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, + int64_t Imm) { + unsigned ResultReg; + if (Imm < 0) + ResultReg = emitAddSub_ri(false, VT, Op0, Op0IsKill, -Imm); + else + ResultReg = emitAddSub_ri(true, VT, Op0, Op0IsKill, Imm); + + if (ResultReg) + return ResultReg; + + unsigned CReg = fastEmit_i(VT, VT, ISD::Constant, Imm); + if (!CReg) + return 0; + + ResultReg = emitAddSub_rr(true, VT, Op0, Op0IsKill, CReg, true); + return ResultReg; +} + unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags, bool WantResult, bool IsZExt) { return emitAddSub(/*UseAdd=*/false, RetVT, LHS, RHS, SetFlags, WantResult, @@ -1226,12 +1484,16 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (isa(LHS) && !isa(RHS)) std::swap(LHS, RHS); + // Canonicalize mul by power-of-2 to the RHS. + if (LHS->hasOneUse() && isValueAvailable(LHS)) + if (isMulPowOf2(LHS)) + std::swap(LHS, RHS); + // Canonicalize shift immediate to the RHS. - if (isValueAvailable(LHS)) - if (const auto *SI = dyn_cast(LHS)) + if (LHS->hasOneUse() && isValueAvailable(LHS)) + if (const auto *SI = dyn_cast(LHS)) if (isa(SI->getOperand(1))) - if (SI->getOpcode() == Instruction::Shl) - std::swap(LHS, RHS); + std::swap(LHS, RHS); unsigned LHSReg = getRegForValue(LHS); if (!LHSReg) @@ -1246,19 +1508,39 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (ResultReg) return ResultReg; + // Check if the mul can be folded into the instruction. + if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (isMulPowOf2(RHS)) { + const Value *MulLHS = cast(RHS)->getOperand(0); + const Value *MulRHS = cast(RHS)->getOperand(1); + + if (const auto *C = dyn_cast(MulLHS)) + if (C->getValue().isPowerOf2()) + std::swap(MulLHS, MulRHS); + + assert(isa(MulRHS) && "Expected a ConstantInt."); + uint64_t ShiftVal = cast(MulRHS)->getValue().logBase2(); + + unsigned RHSReg = getRegForValue(MulLHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(MulLHS); + return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + } + // Check if the shift can be folded into the instruction. - if (isValueAvailable(RHS)) - if (const auto *SI = dyn_cast(RHS)) - if (const auto *C = dyn_cast(SI->getOperand(1))) - if (SI->getOpcode() == Instruction::Shl) { - uint64_t ShiftVal = C->getZExtValue(); - unsigned RHSReg = getRegForValue(SI->getOperand(0)); - if (!RHSReg) - return 0; - bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); - } + if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (const auto *SI = dyn_cast(RHS)) + if (const auto *C = dyn_cast(SI->getOperand(1))) { + uint64_t ShiftVal = C->getZExtValue(); + unsigned RHSReg = getRegForValue(SI->getOperand(0)); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); + return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) @@ -1363,23 +1645,15 @@ unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm); } -bool AArch64FastISel::emitLoad(MVT VT, unsigned &ResultReg, Address Addr, - MachineMemOperand *MMO) { +unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr, + bool WantZExt, MachineMemOperand *MMO) { // Simplify this down to something we can handle. if (!simplifyAddress(Addr, VT)) - return false; + return 0; - unsigned ScaleFactor; - switch (VT.SimpleTy) { - default: llvm_unreachable("Unexpected value type."); - case MVT::i1: // fall-through - case MVT::i8: ScaleFactor = 1; break; - case MVT::i16: ScaleFactor = 2; break; - case MVT::i32: // fall-through - case MVT::f32: ScaleFactor = 4; break; - case MVT::i64: // fall-through - case MVT::f64: ScaleFactor = 8; break; - } + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + llvm_unreachable("Unexpected value type."); // Negative offsets require unscaled, 9-bit, signed immediate offsets. // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. @@ -1389,20 +1663,54 @@ bool AArch64FastISel::emitLoad(MVT VT, unsigned &ResultReg, Address Addr, ScaleFactor = 1; } - static const unsigned OpcTable[4][6] = { - { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, AArch64::LDURXi, - AArch64::LDURSi, AArch64::LDURDi }, - { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, AArch64::LDRXui, - AArch64::LDRSui, AArch64::LDRDui }, - { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, AArch64::LDRXroX, - AArch64::LDRSroX, AArch64::LDRDroX }, - { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, AArch64::LDRXroW, - AArch64::LDRSroW, AArch64::LDRDroW } + static const unsigned GPOpcTable[2][8][4] = { + // Sign-extend. + { { AArch64::LDURSBWi, AArch64::LDURSHWi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDURSBXi, AArch64::LDURSHXi, AArch64::LDURSWi, + AArch64::LDURXi }, + { AArch64::LDRSBWui, AArch64::LDRSHWui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRSBXui, AArch64::LDRSHXui, AArch64::LDRSWui, + AArch64::LDRXui }, + { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX, + AArch64::LDRXroX }, + { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW, + AArch64::LDRXroW }, + { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW, + AArch64::LDRXroW } + }, + // Zero-extend. + { { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, + AArch64::LDRXroW }, + { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, + AArch64::LDRXroW } + } + }; + + static const unsigned FPOpcTable[4][2] = { + { AArch64::LDURSi, AArch64::LDURDi }, + { AArch64::LDRSui, AArch64::LDRDui }, + { AArch64::LDRSroX, AArch64::LDRDroX }, + { AArch64::LDRSroW, AArch64::LDRDroW } }; unsigned Opc; const TargetRegisterClass *RC; - bool VTIsi1 = false; bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() && Addr.getOffsetReg(); unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0; @@ -1410,30 +1718,65 @@ bool AArch64FastISel::emitLoad(MVT VT, unsigned &ResultReg, Address Addr, Addr.getExtendType() == AArch64_AM::SXTW) Idx++; + bool IsRet64Bit = RetVT == MVT::i64; switch (VT.SimpleTy) { - default: llvm_unreachable("Unexpected value type."); - case MVT::i1: VTIsi1 = true; // Intentional fall-through. - case MVT::i8: Opc = OpcTable[Idx][0]; RC = &AArch64::GPR32RegClass; break; - case MVT::i16: Opc = OpcTable[Idx][1]; RC = &AArch64::GPR32RegClass; break; - case MVT::i32: Opc = OpcTable[Idx][2]; RC = &AArch64::GPR32RegClass; break; - case MVT::i64: Opc = OpcTable[Idx][3]; RC = &AArch64::GPR64RegClass; break; - case MVT::f32: Opc = OpcTable[Idx][4]; RC = &AArch64::FPR32RegClass; break; - case MVT::f64: Opc = OpcTable[Idx][5]; RC = &AArch64::FPR64RegClass; break; + default: + llvm_unreachable("Unexpected value type."); + case MVT::i1: // Intentional fall-through. + case MVT::i8: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; + break; + case MVT::i16: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; + break; + case MVT::i32: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; + break; + case MVT::i64: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3]; + RC = &AArch64::GPR64RegClass; + break; + case MVT::f32: + Opc = FPOpcTable[Idx][0]; + RC = &AArch64::FPR32RegClass; + break; + case MVT::f64: + Opc = FPOpcTable[Idx][1]; + RC = &AArch64::FPR64RegClass; + break; } // Create the base instruction, then add the operands. - ResultReg = createResultReg(RC); + unsigned ResultReg = createResultReg(RC); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO); // Loading an i1 requires special handling. - if (VTIsi1) { + if (VT == MVT::i1) { unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1); assert(ANDReg && "Unexpected AND instruction emission failure."); ResultReg = ANDReg; } - return true; + + // For zero-extending loads to 64bit we emit a 32bit load and then convert + // the 32bit reg to a 64bit reg. + if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) { + unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(ResultReg, getKillRegState(true)) + .addImm(AArch64::sub_32); + ResultReg = Reg64; + } + return ResultReg; } bool AArch64FastISel::selectAddSub(const Instruction *I) { @@ -1505,10 +1848,82 @@ bool AArch64FastISel::selectLoad(const Instruction *I) { if (!computeAddress(I->getOperand(0), Addr, I->getType())) return false; - unsigned ResultReg; - if (!emitLoad(VT, ResultReg, Addr, createMachineMemOperandFor(I))) + // Fold the following sign-/zero-extend into the load instruction. + bool WantZExt = true; + MVT RetVT = VT; + const Value *IntExtVal = nullptr; + if (I->hasOneUse()) { + if (const auto *ZE = dyn_cast(I->use_begin()->getUser())) { + if (isTypeSupported(ZE->getType(), RetVT)) + IntExtVal = ZE; + else + RetVT = VT; + } else if (const auto *SE = dyn_cast(I->use_begin()->getUser())) { + if (isTypeSupported(SE->getType(), RetVT)) + IntExtVal = SE; + else + RetVT = VT; + WantZExt = false; + } + } + + unsigned ResultReg = + emitLoad(VT, RetVT, Addr, WantZExt, createMachineMemOperandFor(I)); + if (!ResultReg) return false; + // There are a few different cases we have to handle, because the load or the + // sign-/zero-extend might not be selected by FastISel if we fall-back to + // SelectionDAG. There is also an ordering issue when both instructions are in + // different basic blocks. + // 1.) The load instruction is selected by FastISel, but the integer extend + // not. This usually happens when the integer extend is in a different + // basic block and SelectionDAG took over for that basic block. + // 2.) The load instruction is selected before the integer extend. This only + // happens when the integer extend is in a different basic block. + // 3.) The load instruction is selected by SelectionDAG and the integer extend + // by FastISel. This happens if there are instructions between the load + // and the integer extend that couldn't be selected by FastISel. + if (IntExtVal) { + // The integer extend hasn't been emitted yet. FastISel or SelectionDAG + // could select it. Emit a copy to subreg if necessary. FastISel will remove + // it when it selects the integer extend. + unsigned Reg = lookUpRegForValue(IntExtVal); + if (!Reg) { + if (RetVT == MVT::i64 && VT <= MVT::i32) { + if (WantZExt) { + // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG). + std::prev(FuncInfo.InsertPt)->eraseFromParent(); + ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg(); + } else + ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg, + /*IsKill=*/true, + AArch64::sub_32); + } + updateValueMap(I, ResultReg); + return true; + } + + // The integer extend has already been emitted - delete all the instructions + // that have been emitted by the integer extend lowering code and use the + // result from the load instruction directly. + while (Reg) { + auto *MI = MRI.getUniqueVRegDef(Reg); + if (!MI) + break; + Reg = 0; + for (auto &Opnd : MI->uses()) { + if (Opnd.isReg()) { + Reg = Opnd.getReg(); + break; + } + } + MI->eraseFromParent(); + } + updateValueMap(IntExtVal, ResultReg); + return true; + } + updateValueMap(I, ResultReg); return true; } @@ -1519,17 +1934,9 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, if (!simplifyAddress(Addr, VT)) return false; - unsigned ScaleFactor; - switch (VT.SimpleTy) { - default: llvm_unreachable("Unexpected value type."); - case MVT::i1: // fall-through - case MVT::i8: ScaleFactor = 1; break; - case MVT::i16: ScaleFactor = 2; break; - case MVT::i32: // fall-through - case MVT::f32: ScaleFactor = 4; break; - case MVT::i64: // fall-through - case MVT::f64: ScaleFactor = 8; break; - } + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + llvm_unreachable("Unexpected value type."); // Negative offsets require unscaled, 9-bit, signed immediate offsets. // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. @@ -1539,7 +1946,6 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, ScaleFactor = 1; } - static const unsigned OpcTable[4][6] = { { AArch64::STURBBi, AArch64::STURHHi, AArch64::STURWi, AArch64::STURXi, AArch64::STURSi, AArch64::STURDi }, @@ -1549,7 +1955,6 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, AArch64::STRSroX, AArch64::STRDroX }, { AArch64::STRBBroW, AArch64::STRHHroW, AArch64::STRWroW, AArch64::STRXroW, AArch64::STRSroW, AArch64::STRDroW } - }; unsigned Opc; @@ -1673,6 +2078,132 @@ static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { } } +/// \brief Try to emit a combined compare-and-branch instruction. +bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { + assert(isa(BI->getCondition()) && "Expected cmp instruction"); + const CmpInst *CI = cast(BI->getCondition()); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + Type *Ty = LHS->getType(); + if (!Ty->isIntegerTy()) + return false; + + unsigned BW = cast(Ty)->getBitWidth(); + if (BW != 1 && BW != 8 && BW != 16 && BW != 32 && BW != 64) + return false; + + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Try to take advantage of fallthrough opportunities. + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + int TestBit = -1; + bool IsCmpNE; + if ((Predicate == CmpInst::ICMP_EQ) || (Predicate == CmpInst::ICMP_NE)) { + if (const auto *C = dyn_cast(LHS)) + if (C->isNullValue()) + std::swap(LHS, RHS); + + if (!isa(RHS)) + return false; + + if (!cast(RHS)->isNullValue()) + return false; + + if (const auto *AI = dyn_cast(LHS)) + if (AI->getOpcode() == Instruction::And) { + const Value *AndLHS = AI->getOperand(0); + const Value *AndRHS = AI->getOperand(1); + + if (const auto *C = dyn_cast(AndLHS)) + if (C->getValue().isPowerOf2()) + std::swap(AndLHS, AndRHS); + + if (const auto *C = dyn_cast(AndRHS)) + if (C->getValue().isPowerOf2()) { + TestBit = C->getValue().logBase2(); + LHS = AndLHS; + } + } + IsCmpNE = Predicate == CmpInst::ICMP_NE; + } else if (Predicate == CmpInst::ICMP_SLT) { + if (!isa(RHS)) + return false; + + if (!cast(RHS)->isNullValue()) + return false; + + TestBit = BW - 1; + IsCmpNE = true; + } else if (Predicate == CmpInst::ICMP_SGT) { + if (!isa(RHS)) + return false; + + if (cast(RHS)->getValue() != -1) + return false; + + TestBit = BW - 1; + IsCmpNE = false; + } else + return false; + + static const unsigned OpcTable[2][2][2] = { + { {AArch64::CBZW, AArch64::CBZX }, + {AArch64::CBNZW, AArch64::CBNZX} }, + { {AArch64::TBZW, AArch64::TBZX }, + {AArch64::TBNZW, AArch64::TBNZX} } + }; + + bool IsBitTest = TestBit != -1; + bool Is64Bit = BW == 64; + if (TestBit < 32 && TestBit >= 0) + Is64Bit = false; + + unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit]; + const MCInstrDesc &II = TII.get(Opc); + + unsigned SrcReg = getRegForValue(LHS); + if (!SrcReg) + return false; + bool SrcIsKill = hasTrivialKill(LHS); + + if (BW == 64 && !Is64Bit) + SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill, + AArch64::sub_32); + + if ((BW < 32) && !IsBitTest) { + EVT CmpEVT = TLI.getValueType(Ty, true); + SrcReg = + emitIntExt(CmpEVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ true); + } + + // Emit the combined compare and branch instruction. + SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs()); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) + .addReg(SrcReg, getKillRegState(SrcIsKill)); + if (IsBitTest) + MIB.addImm(TestBit); + MIB.addMBB(TBB); + + // Obtain the branch weight and add the TrueBB to the successor list. + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + fastEmitBranch(FBB, DbgLoc); + + return true; +} + bool AArch64FastISel::selectBranch(const Instruction *I) { const BranchInst *BI = cast(I); if (BI->isUnconditional()) { @@ -1686,16 +2217,59 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { AArch64CC::CondCode CC = AArch64CC::NE; if (const CmpInst *CI = dyn_cast(BI->getCondition())) { - if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { - // We may not handle every CC for now. - CC = getCompareCC(CI->getPredicate()); - if (CC == AArch64CC::AL) - return false; + if (CI->hasOneUse() && isValueAvailable(CI)) { + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + switch (Predicate) { + default: + break; + case CmpInst::FCMP_FALSE: + fastEmitBranch(FBB, DbgLoc); + return true; + case CmpInst::FCMP_TRUE: + fastEmitBranch(TBB, DbgLoc); + return true; + } + + // Try to emit a combined compare-and-branch first. + if (emitCompareAndBranch(BI)) + return true; + + // Try to take advantage of fallthrough opportunities. + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } // Emit the cmp. if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) return false; + // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch + // instruction. + CC = getCompareCC(Predicate); + AArch64CC::CondCode ExtraCC = AArch64CC::AL; + switch (Predicate) { + default: + break; + case CmpInst::FCMP_UEQ: + ExtraCC = AArch64CC::EQ; + CC = AArch64CC::VS; + break; + case CmpInst::FCMP_ONE: + ExtraCC = AArch64CC::MI; + CC = AArch64CC::GT; + break; + } + assert((CC != AArch64CC::AL) && "Unexpected condition code."); + + // Emit the extra branch for FCMP_UEQ and FCMP_ONE. + if (ExtraCC != AArch64CC::AL) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(ExtraCC) + .addMBB(TBB); + } + // Emit the branch. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) .addImm(CC) @@ -1713,8 +2287,8 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { } } else if (TruncInst *TI = dyn_cast(BI->getCondition())) { MVT SrcVT; - if (TI->hasOneUse() && TI->getParent() == I->getParent() && - (isTypeSupported(TI->getOperand(0)->getType(), SrcVT))) { + if (TI->hasOneUse() && isValueAvailable(TI) && + isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) { unsigned CondReg = getRegForValue(TI->getOperand(0)); if (!CondReg) return false; @@ -1749,8 +2323,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { fastEmitBranch(FBB, DbgLoc); return true; } - } else if (const ConstantInt *CI = - dyn_cast(BI->getCondition())) { + } else if (const auto *CI = dyn_cast(BI->getCondition())) { uint64_t Imm = CI->getZExtValue(); MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) @@ -2477,14 +3050,11 @@ bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src, } } - bool RV; - unsigned ResultReg; - RV = emitLoad(VT, ResultReg, Src); - if (!RV) + unsigned ResultReg = emitLoad(VT, VT, Src); + if (!ResultReg) return false; - RV = emitStore(VT, ResultReg, Dest); - if (!RV) + if (!emitStore(VT, ResultReg, Dest)) return false; int64_t Size = VT.getSizeInBits() / 8; @@ -2522,19 +3092,53 @@ bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC, if (RetVT != MVT::i32 && RetVT != MVT::i64) return false; + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); + + // Canonicalize immediate to the RHS. + if (isa(LHS) && !isa(RHS) && + isCommutativeIntrinsic(II)) + std::swap(LHS, RHS); + + // Simplify multiplies. + unsigned IID = II->getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::smul_with_overflow: + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 2) + IID = Intrinsic::sadd_with_overflow; + break; + case Intrinsic::umul_with_overflow: + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 2) + IID = Intrinsic::uadd_with_overflow; + break; + } + AArch64CC::CondCode TmpCC; - switch (II->getIntrinsicID()) { - default: return false; - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: TmpCC = AArch64CC::VS; break; - case Intrinsic::uadd_with_overflow: TmpCC = AArch64CC::HS; break; - case Intrinsic::usub_with_overflow: TmpCC = AArch64CC::LO; break; - case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: TmpCC = AArch64CC::NE; break; + switch (IID) { + default: + return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + TmpCC = AArch64CC::VS; + break; + case Intrinsic::uadd_with_overflow: + TmpCC = AArch64CC::HS; + break; + case Intrinsic::usub_with_overflow: + TmpCC = AArch64CC::LO; + break; + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + TmpCC = AArch64CC::NE; + break; } // Check if both instructions are in the same basic block. - if (II->getParent() != I->getParent()) + if (!isValueAvailable(II)) return false; // Make sure nothing is in the way @@ -2739,9 +3343,30 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { isCommutativeIntrinsic(II)) std::swap(LHS, RHS); + // Simplify multiplies. + unsigned IID = II->getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::smul_with_overflow: + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 2) { + IID = Intrinsic::sadd_with_overflow; + RHS = LHS; + } + break; + case Intrinsic::umul_with_overflow: + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 2) { + IID = Intrinsic::uadd_with_overflow; + RHS = LHS; + } + break; + } + unsigned ResultReg1 = 0, ResultReg2 = 0, MulReg = 0; AArch64CC::CondCode CC = AArch64CC::Invalid; - switch (II->getIntrinsicID()) { + switch (IID) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::sadd_with_overflow: ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true); @@ -2831,6 +3456,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass, AArch64::WZR, /*IsKill=*/true, AArch64::WZR, /*IsKill=*/true, getInvertedCondCode(CC)); + (void)ResultReg2; assert((ResultReg1 + 1) == ResultReg2 && "Nonconsecutive result registers."); updateValueMap(II, ResultReg1, 2); @@ -3104,8 +3730,9 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, bool IsZext) { assert(RetVT.SimpleTy >= SrcVT.SimpleTy && "Unexpected source/return type pair."); - assert((SrcVT == MVT::i8 || SrcVT == MVT::i16 || SrcVT == MVT::i32 || - SrcVT == MVT::i64) && "Unexpected source value type."); + assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || + SrcVT == MVT::i32 || SrcVT == MVT::i64) && + "Unexpected source value type."); assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 || RetVT == MVT::i64) && "Unexpected return value type."); @@ -3431,48 +4058,146 @@ unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm); } -bool AArch64FastISel::selectIntExt(const Instruction *I) { - // On ARM, in general, integer casts don't involve legal types; this code - // handles promotable integers. The high bits for a type smaller than - // the register size are assumed to be undefined. - Type *DestTy = I->getType(); - Value *Src = I->getOperand(0); - Type *SrcTy = Src->getType(); +static bool isZExtLoad(const MachineInstr *LI) { + switch (LI->getOpcode()) { + default: + return false; + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURWi: + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRWui: + case AArch64::LDRBBroX: + case AArch64::LDRHHroX: + case AArch64::LDRWroX: + case AArch64::LDRBBroW: + case AArch64::LDRHHroW: + case AArch64::LDRWroW: + return true; + } +} - unsigned SrcReg = getRegForValue(Src); - if (!SrcReg) +static bool isSExtLoad(const MachineInstr *LI) { + switch (LI->getOpcode()) { + default: return false; + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + case AArch64::LDURSBXi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::LDRSBWui: + case AArch64::LDRSHWui: + case AArch64::LDRSBXui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::LDRSBWroX: + case AArch64::LDRSHWroX: + case AArch64::LDRSBXroX: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroX: + case AArch64::LDRSBWroW: + case AArch64::LDRSHWroW: + case AArch64::LDRSBXroW: + case AArch64::LDRSHXroW: + case AArch64::LDRSWroW: + return true; + } +} - EVT SrcEVT = TLI.getValueType(SrcTy, true); - EVT DestEVT = TLI.getValueType(DestTy, true); - if (!SrcEVT.isSimple()) +bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT, + MVT SrcVT) { + const auto *LI = dyn_cast(I->getOperand(0)); + if (!LI || !LI->hasOneUse()) return false; - if (!DestEVT.isSimple()) + + // Check if the load instruction has already been selected. + unsigned Reg = lookUpRegForValue(LI); + if (!Reg) return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - MVT DestVT = DestEVT.getSimpleVT(); - unsigned ResultReg = 0; + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + if (!MI) + return false; + + // Check if the correct load instruction has been emitted - SelectionDAG might + // have emitted a zero-extending load, but we need a sign-extending load. + bool IsZExt = isa(I); + const auto *LoadMI = MI; + if (LoadMI->getOpcode() == TargetOpcode::COPY && + LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) { + unsigned LoadReg = MI->getOperand(1).getReg(); + LoadMI = MRI.getUniqueVRegDef(LoadReg); + assert(LoadMI && "Expected valid instruction"); + } + if (!(IsZExt && isZExtLoad(LoadMI)) && !(!IsZExt && isSExtLoad(LoadMI))) + return false; + + // Nothing to be done. + if (RetVT != MVT::i64 || SrcVT > MVT::i32) { + updateValueMap(I, Reg); + return true; + } + + if (IsZExt) { + unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(Reg, getKillRegState(true)) + .addImm(AArch64::sub_32); + Reg = Reg64; + } else { + assert((MI->getOpcode() == TargetOpcode::COPY && + MI->getOperand(1).getSubReg() == AArch64::sub_32) && + "Expected copy instruction"); + Reg = MI->getOperand(1).getReg(); + MI->eraseFromParent(); + } + updateValueMap(I, Reg); + return true; +} + +bool AArch64FastISel::selectIntExt(const Instruction *I) { + assert((isa(I) || isa(I)) && + "Unexpected integer extend instruction."); + MVT RetVT; + MVT SrcVT; + if (!isTypeSupported(I->getType(), RetVT)) + return false; + + if (!isTypeSupported(I->getOperand(0)->getType(), SrcVT)) + return false; + + // Try to optimize already sign-/zero-extended values from load instructions. + if (optimizeIntExtLoad(I, RetVT, SrcVT)) + return true; + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (!SrcReg) + return false; + bool SrcIsKill = hasTrivialKill(I->getOperand(0)); + // Try to optimize already sign-/zero-extended values from function arguments. bool IsZExt = isa(I); - // Check if it is an argument and if it is already zero/sign-extended. - if (const auto *Arg = dyn_cast(Src)) { + if (const auto *Arg = dyn_cast(I->getOperand(0))) { if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) { - if (DestVT == MVT::i64) { - ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + if (RetVT == MVT::i64 && SrcVT != MVT::i64) { + unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), ResultReg) - .addImm(0) - .addReg(SrcReg) - .addImm(AArch64::sub_32); - } else - ResultReg = SrcReg; + .addImm(0) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(AArch64::sub_32); + SrcReg = ResultReg; + } + updateValueMap(I, SrcReg); + return true; } } - if (!ResultReg) - ResultReg = emitIntExt(SrcVT, SrcReg, DestVT, IsZExt); - + unsigned ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt); if (!ResultReg) return false; @@ -3527,15 +4252,58 @@ bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) { } bool AArch64FastISel::selectMul(const Instruction *I) { - EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true); - if (!SrcEVT.isSimple()) + MVT VT; + if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true)) return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - // Must be simple value type. Don't handle vectors. - if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && - SrcVT != MVT::i8) - return false; + if (VT.isVector()) + return selectBinaryOp(I, ISD::MUL); + + const Value *Src0 = I->getOperand(0); + const Value *Src1 = I->getOperand(1); + if (const auto *C = dyn_cast(Src0)) + if (C->getValue().isPowerOf2()) + std::swap(Src0, Src1); + + // Try to simplify to a shift instruction. + if (const auto *C = dyn_cast(Src1)) + if (C->getValue().isPowerOf2()) { + uint64_t ShiftVal = C->getValue().logBase2(); + MVT SrcVT = VT; + bool IsZExt = true; + if (const auto *ZExt = dyn_cast(Src0)) { + if (!isIntExtFree(ZExt)) { + MVT VT; + if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), VT)) { + SrcVT = VT; + IsZExt = true; + Src0 = ZExt->getOperand(0); + } + } + } else if (const auto *SExt = dyn_cast(Src0)) { + if (!isIntExtFree(SExt)) { + MVT VT; + if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), VT)) { + SrcVT = VT; + IsZExt = false; + Src0 = SExt->getOperand(0); + } + } + } + + unsigned Src0Reg = getRegForValue(Src0); + if (!Src0Reg) + return false; + bool Src0IsKill = hasTrivialKill(Src0); + + unsigned ResultReg = + emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt); + + if (ResultReg) { + updateValueMap(I, ResultReg); + return true; + } + } unsigned Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) @@ -3547,8 +4315,7 @@ bool AArch64FastISel::selectMul(const Instruction *I) { return false; bool Src1IsKill = hasTrivialKill(I->getOperand(1)); - unsigned ResultReg = - emitMul_rr(SrcVT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill); + unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill); if (!ResultReg) return false; @@ -3572,18 +4339,22 @@ bool AArch64FastISel::selectShift(const Instruction *I) { bool IsZExt = (I->getOpcode() == Instruction::AShr) ? false : true; const Value *Op0 = I->getOperand(0); if (const auto *ZExt = dyn_cast(Op0)) { - MVT TmpVT; - if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) { - SrcVT = TmpVT; - IsZExt = true; - Op0 = ZExt->getOperand(0); + if (!isIntExtFree(ZExt)) { + MVT TmpVT; + if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) { + SrcVT = TmpVT; + IsZExt = true; + Op0 = ZExt->getOperand(0); + } } } else if (const auto *SExt = dyn_cast(Op0)) { - MVT TmpVT; - if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) { - SrcVT = TmpVT; - IsZExt = false; - Op0 = SExt->getOperand(0); + if (!isIntExtFree(SExt)) { + MVT TmpVT; + if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) { + SrcVT = TmpVT; + IsZExt = false; + Op0 = SExt->getOperand(0); + } } } @@ -3720,6 +4491,147 @@ bool AArch64FastISel::selectFRem(const Instruction *I) { return true; } +bool AArch64FastISel::selectSDiv(const Instruction *I) { + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + if (!isa(I->getOperand(1))) + return selectBinaryOp(I, ISD::SDIV); + + const APInt &C = cast(I->getOperand(1))->getValue(); + if ((VT != MVT::i32 && VT != MVT::i64) || !C || + !(C.isPowerOf2() || (-C).isPowerOf2())) + return selectBinaryOp(I, ISD::SDIV); + + unsigned Lg2 = C.countTrailingZeros(); + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + bool Src0IsKill = hasTrivialKill(I->getOperand(0)); + + if (cast(I)->isExact()) { + unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2); + if (!ResultReg) + return false; + updateValueMap(I, ResultReg); + return true; + } + + int64_t Pow2MinusOne = (1ULL << Lg2) - 1; + unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne); + if (!AddReg) + return false; + + // (Src0 < 0) ? Pow2 - 1 : 0; + if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0)) + return false; + + unsigned SelectOpc; + const TargetRegisterClass *RC; + if (VT == MVT::i64) { + SelectOpc = AArch64::CSELXr; + RC = &AArch64::GPR64RegClass; + } else { + SelectOpc = AArch64::CSELWr; + RC = &AArch64::GPR32RegClass; + } + unsigned SelectReg = + fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg, + Src0IsKill, AArch64CC::LT); + if (!SelectReg) + return false; + + // Divide by Pow2 --> ashr. If we're dividing by a negative value we must also + // negate the result. + unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; + unsigned ResultReg; + if (C.isNegative()) + ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true, + SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2); + else + ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2); + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +/// This is mostly a copy of the existing FastISel GEP code, but we have to +/// duplicate it for AArch64, because otherwise we would bail out even for +/// simple cases. This is because the standard fastEmit functions don't cover +/// MUL at all and ADD is lowered very inefficientily. +bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { + unsigned N = getRegForValue(I->getOperand(0)); + if (!N) + return false; + bool NIsKill = hasTrivialKill(I->getOperand(0)); + + // Keep a running tab of the total offset to coalesce multiple N = N + Offset + // into a single N = N + TotalOffset. + uint64_t TotalOffs = 0; + Type *Ty = I->getOperand(0)->getType(); + MVT VT = TLI.getPointerTy(); + for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) { + const Value *Idx = *OI; + if (auto *StTy = dyn_cast(Ty)) { + unsigned Field = cast(Idx)->getZExtValue(); + // N = N + Offset + if (Field) + TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field); + Ty = StTy->getElementType(Field); + } else { + Ty = cast(Ty)->getElementType(); + // If this is a constant subscript, handle it quickly. + if (const auto *CI = dyn_cast(Idx)) { + if (CI->isZero()) + continue; + // N = N + Offset + TotalOffs += + DL.getTypeAllocSize(Ty) * cast(CI)->getSExtValue(); + continue; + } + if (TotalOffs) { + N = emitAdd_ri_(VT, N, NIsKill, TotalOffs); + if (!N) + return false; + NIsKill = true; + TotalOffs = 0; + } + + // N = N + Idx * ElementSize; + uint64_t ElementSize = DL.getTypeAllocSize(Ty); + std::pair Pair = getRegForGEPIndex(Idx); + unsigned IdxN = Pair.first; + bool IdxNIsKill = Pair.second; + if (!IdxN) + return false; + + if (ElementSize != 1) { + unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize); + if (!C) + return false; + IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true); + if (!IdxN) + return false; + IdxNIsKill = true; + } + N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill); + if (!N) + return false; + } + } + if (TotalOffs) { + N = emitAdd_ri_(VT, N, NIsKill, TotalOffs); + if (!N) + return false; + } + updateValueMap(I, N); + return true; +} + bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { default: @@ -3728,9 +4640,9 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { case Instruction::Sub: return selectAddSub(I); case Instruction::Mul: - if (!selectBinaryOp(I, ISD::MUL)) - return selectMul(I); - return true; + return selectMul(I); + case Instruction::SDiv: + return selectSDiv(I); case Instruction::SRem: if (!selectBinaryOp(I, ISD::SREM)) return selectRem(I, ISD::SREM); @@ -3762,13 +4674,8 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { case Instruction::FPToUI: return selectFPToInt(I, /*Signed=*/false); case Instruction::ZExt: - if (!selectCast(I, ISD::ZERO_EXTEND)) - return selectIntExt(I); - return true; case Instruction::SExt: - if (!selectCast(I, ISD::SIGN_EXTEND)) - return selectIntExt(I); - return true; + return selectIntExt(I); case Instruction::Trunc: if (!selectCast(I, ISD::TRUNCATE)) return selectTrunc(I); @@ -3796,6 +4703,8 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { return selectRet(I); case Instruction::FRem: return selectFRem(I); + case Instruction::GetElementPtr: + return selectGetElementPtr(I); } // fall-back to target-independent instruction selection.