X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FAArch64%2FAArch64FastISel.cpp;h=c3f6859f510d6840396ef2564bd0eaf782793323;hp=826c4c089a8fafd9c1fa21c8968c421ba7ffc9e8;hb=8dd904ce60401e667ce2a4e7b32d35c5b9b5ea43;hpb=17e0ee5078173356241cb860e2408100ce85ddf4 diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 826c4c089a8..c3f6859f510 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "AArch64CallingConvention.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -78,11 +79,9 @@ class AArch64FastISel final : public FastISel { return Base.Reg; } void setOffsetReg(unsigned Reg) { - assert(isRegBase() && "Invalid offset register access!"); OffsetReg = Reg; } unsigned getOffsetReg() const { - assert(isRegBase() && "Invalid offset register access!"); return OffsetReg; } void setFI(unsigned FI) { @@ -133,6 +132,8 @@ private: bool selectShift(const Instruction *I); bool selectBitCast(const Instruction *I); bool selectFRem(const Instruction *I); + bool selectSDiv(const Instruction *I); + bool selectGetElementPtr(const Instruction *I); // Utility helper routines. bool isTypeLegal(Type *Ty, MVT &VT); @@ -149,6 +150,9 @@ private: unsigned Alignment); bool foldXALUIntrinsic(AArch64CC::CondCode &CC, const Instruction *I, const Value *Cond); + bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT); + bool optimizeSelect(const SelectInst *SI); + std::pair getRegForGEPIndex(const Value *Idx); // Emit helper routines. unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, @@ -172,12 +176,13 @@ private: bool WantResult = true); // Emit functions. + bool emitCompareAndBranch(const BranchInst *BI); bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt); bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt); bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS); - bool emitLoad(MVT VT, unsigned &ResultReg, Address Addr, - MachineMemOperand *MMO = nullptr); + unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true, + MachineMemOperand *MMO = nullptr); bool emitStore(MVT VT, unsigned SrcReg, Address Addr, MachineMemOperand *MMO = nullptr); unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); @@ -185,6 +190,7 @@ private: unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags = false, bool WantResult = true, bool IsZExt = false); + unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm); unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags = false, bool WantResult = true, bool IsZExt = false); @@ -239,9 +245,10 @@ public: unsigned fastMaterializeFloatZero(const ConstantFP* CF) override; explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo, - const TargetLibraryInfo *LibInfo) + const TargetLibraryInfo *LibInfo) : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) { - Subtarget = &TM.getSubtarget(); + Subtarget = + &static_cast(FuncInfo.MF->getSubtarget()); Context = &FuncInfo.Fn->getContext(); } @@ -254,9 +261,50 @@ public: #include "AArch64GenCallingConv.inc" +/// \brief Check if the sign-/zero-extend will be a noop. +static bool isIntExtFree(const Instruction *I) { + assert((isa(I) || isa(I)) && + "Unexpected integer extend instruction."); + assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() && + "Unexpected value type."); + bool IsZExt = isa(I); + + if (const auto *LI = dyn_cast(I->getOperand(0))) + if (LI->hasOneUse()) + return true; + + if (const auto *Arg = dyn_cast(I->getOperand(0))) + if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) + return true; + + return false; +} + +/// \brief Determine the implicit scale factor that is applied by a memory +/// operation for a given value type. +static unsigned getImplicitScaleFactor(MVT VT) { + switch (VT.SimpleTy) { + default: + return 0; // invalid + case MVT::i1: // fall-through + case MVT::i8: + return 1; + case MVT::i16: + return 2; + case MVT::i32: // fall-through + case MVT::f32: + return 4; + case MVT::i64: // fall-through + case MVT::f64: + return 8; + } +} + CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { if (CC == CallingConv::WebKit_JS) return CC_AArch64_WebKit_JS; + if (CC == CallingConv::GHC) + return CC_AArch64_GHC; return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS; } @@ -322,6 +370,24 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) { return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); } + // For the MachO large code model materialize the FP constant in code. + if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { + unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm; + const TargetRegisterClass *RC = Is64Bit ? + &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + unsigned TmpReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg) + .addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(TmpReg, getKillRegState(true)); + + return ResultReg; + } + // Materialize via constant pool. MachineConstantPool wants an explicit // alignment. unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); @@ -425,6 +491,19 @@ unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) { return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true); } +/// \brief Check if the multiply is by a power-of-2 constant. +static bool isMulPowOf2(const Value *I) { + if (const auto *MI = dyn_cast(I)) { + if (const auto *C = dyn_cast(MI->getOperand(0))) + if (C->getValue().isPowerOf2()) + return true; + if (const auto *C = dyn_cast(MI->getOperand(1))) + if (C->getValue().isPowerOf2()) + return true; + } + return false; +} + // Computes the address to get to an object. bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) { @@ -536,7 +615,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) std::swap(LHS, RHS); if (const ConstantInt *CI = dyn_cast(RHS)) { - Addr.setOffset(Addr.getOffset() + (uint64_t)CI->getSExtValue()); + Addr.setOffset(Addr.getOffset() + CI->getSExtValue()); return computeAddress(LHS, Addr, Ty); } @@ -547,66 +626,229 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) break; } - case Instruction::Shl: + case Instruction::Sub: { + // Subs of constants are common and easy enough. + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + if (const ConstantInt *CI = dyn_cast(RHS)) { + Addr.setOffset(Addr.getOffset() - CI->getSExtValue()); + return computeAddress(LHS, Addr, Ty); + } + break; + } + case Instruction::Shl: { if (Addr.getOffsetReg()) break; - if (const auto *CI = dyn_cast(U->getOperand(1))) { - unsigned Val = CI->getZExtValue(); - if (Val < 1 || Val > 3) - break; + const auto *CI = dyn_cast(U->getOperand(1)); + if (!CI) + break; - uint64_t NumBytes = 0; - if (Ty && Ty->isSized()) { - uint64_t NumBits = DL.getTypeSizeInBits(Ty); - NumBytes = NumBits / 8; - if (!isPowerOf2_64(NumBits)) - NumBytes = 0; - } + unsigned Val = CI->getZExtValue(); + if (Val < 1 || Val > 3) + break; - if (NumBytes != (1ULL << Val)) - break; + uint64_t NumBytes = 0; + if (Ty && Ty->isSized()) { + uint64_t NumBits = DL.getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (NumBytes != (1ULL << Val)) + break; - Addr.setShift(Val); - Addr.setExtendType(AArch64_AM::LSL); + Addr.setShift(Val); + Addr.setExtendType(AArch64_AM::LSL); - if (const auto *I = dyn_cast(U->getOperand(0))) - if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) - U = I; + const Value *Src = U->getOperand(0); + if (const auto *I = dyn_cast(Src)) + if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) + Src = I; - if (const auto *ZE = dyn_cast(U)) - if (ZE->getOperand(0)->getType()->isIntegerTy(32)) + // Fold the zext or sext when it won't become a noop. + if (const auto *ZE = dyn_cast(Src)) { + if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast(Src)) { + if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } + } - if (const auto *SE = dyn_cast(U)) - if (SE->getOperand(0)->getType()->isIntegerTy(32)) - Addr.setExtendType(AArch64_AM::SXTW); + if (const auto *AI = dyn_cast(Src)) + if (AI->getOpcode() == Instruction::And) { + const Value *LHS = AI->getOperand(0); + const Value *RHS = AI->getOperand(1); + + if (const auto *C = dyn_cast(LHS)) + if (C->getValue() == 0xffffffff) + std::swap(LHS, RHS); + + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 0xffffffff) { + Addr.setExtendType(AArch64_AM::UXTW); + unsigned Reg = getRegForValue(LHS); + if (!Reg) + return false; + bool RegIsKill = hasTrivialKill(LHS); + Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, + AArch64::sub_32); + Addr.setOffsetReg(Reg); + return true; + } + } - unsigned Reg = getRegForValue(U->getOperand(0)); - if (!Reg) - return false; - Addr.setOffsetReg(Reg); - return true; + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + case Instruction::Mul: { + if (Addr.getOffsetReg()) + break; + + if (!isMulPowOf2(U)) + break; + + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + // Canonicalize power-of-2 value to the RHS. + if (const auto *C = dyn_cast(LHS)) + if (C->getValue().isPowerOf2()) + std::swap(LHS, RHS); + + assert(isa(RHS) && "Expected an ConstantInt."); + const auto *C = cast(RHS); + unsigned Val = C->getValue().logBase2(); + if (Val < 1 || Val > 3) + break; + + uint64_t NumBytes = 0; + if (Ty && Ty->isSized()) { + uint64_t NumBits = DL.getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (NumBytes != (1ULL << Val)) + break; + + Addr.setShift(Val); + Addr.setExtendType(AArch64_AM::LSL); + + const Value *Src = LHS; + if (const auto *I = dyn_cast(Src)) + if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) + Src = I; + + + // Fold the zext or sext when it won't become a noop. + if (const auto *ZE = dyn_cast(Src)) { + if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast(Src)) { + if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } } + + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + case Instruction::And: { + if (Addr.getOffsetReg()) + break; + + if (!Ty || DL.getTypeSizeInBits(Ty) != 8) + break; + + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + if (const auto *C = dyn_cast(LHS)) + if (C->getValue() == 0xffffffff) + std::swap(LHS, RHS); + + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 0xffffffff) { + Addr.setShift(0); + Addr.setExtendType(AArch64_AM::LSL); + Addr.setExtendType(AArch64_AM::UXTW); + + unsigned Reg = getRegForValue(LHS); + if (!Reg) + return false; + bool RegIsKill = hasTrivialKill(LHS); + Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, + AArch64::sub_32); + Addr.setOffsetReg(Reg); + return true; + } break; } + case Instruction::SExt: + case Instruction::ZExt: { + if (!Addr.getReg() || Addr.getOffsetReg()) + break; - if (Addr.getReg()) { - if (!Addr.getOffsetReg()) { - unsigned Reg = getRegForValue(Obj); - if (!Reg) - return false; - Addr.setOffsetReg(Reg); - return true; + const Value *Src = nullptr; + // Fold the zext or sext when it won't become a noop. + if (const auto *ZE = dyn_cast(U)) { + if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast(U)) { + if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } } - return false; + + if (!Src) + break; + + Addr.setShift(0); + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; } + } // end switch - unsigned Reg = getRegForValue(Obj); - if (!Reg) - return false; - Addr.setReg(Reg); - return true; + if (Addr.isRegBase() && !Addr.getReg()) { + unsigned Reg = getRegForValue(Obj); + if (!Reg) + return false; + Addr.setReg(Reg); + return true; + } + + if (!Addr.getOffsetReg()) { + unsigned Reg = getRegForValue(Obj); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + + return false; } bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { @@ -707,17 +949,9 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const { } bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { - unsigned ScaleFactor; - switch (VT.SimpleTy) { - default: return false; - case MVT::i1: // fall-through - case MVT::i8: ScaleFactor = 1; break; - case MVT::i16: ScaleFactor = 2; break; - case MVT::i32: // fall-through - case MVT::f32: ScaleFactor = 4; break; - case MVT::i64: // fall-through - case MVT::f64: ScaleFactor = 8; break; - } + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + return false; bool ImmediateOffsetNeedsLowering = false; bool RegisterOffsetNeedsLowering = false; @@ -731,8 +965,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // Cannot encode an offset register and an immediate offset in the same // instruction. Fold the immediate offset into the load/store instruction and // emit an additonal add to take care of the offset register. - if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.isRegBase() && - Addr.getOffsetReg()) + if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg()) RegisterOffsetNeedsLowering = true; // Cannot encode zero register as base. @@ -742,7 +975,8 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // If this is a stack pointer and the offset needs to be simplified then put // the alloca address into a register, set the base type back to register and // continue. This should almost never happen. - if (ImmediateOffsetNeedsLowering && Addr.isFIBase()) { + if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase()) + { unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) @@ -792,10 +1026,10 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // Since the offset is too large for the load/store instruction get the // reg+offset into a register. if (ImmediateOffsetNeedsLowering) { - unsigned ResultReg = 0; + unsigned ResultReg; if (Addr.getReg()) - ResultReg = fastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), - /*IsKill=*/false, Offset, MVT::i64); + // Try to fold the immediate into the add instruction. + ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset); else ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset); @@ -839,10 +1073,8 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr, MIB.addReg(Addr.getOffsetReg()); MIB.addImm(IsSigned); MIB.addImm(Addr.getShift() != 0); - } else { - MIB.addReg(Addr.getReg()); - MIB.addImm(Offset); - } + } else + MIB.addReg(Addr.getReg()).addImm(Offset); } if (MMO) @@ -876,11 +1108,16 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, RetVT.SimpleTy = std::max(RetVT.SimpleTy, MVT::i32); // Canonicalize immediates to the RHS first. - if (UseAdd && isa(LHS) && !isa(RHS)) + if (UseAdd && isa(LHS) && !isa(RHS)) std::swap(LHS, RHS); + // Canonicalize mul by power of 2 to the RHS. + if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS)) + if (isMulPowOf2(LHS)) + std::swap(LHS, RHS); + // Canonicalize shift immediate to the RHS. - if (UseAdd && isValueAvailable(LHS)) + if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS)) if (const auto *SI = dyn_cast(LHS)) if (isa(SI->getOperand(1))) if (SI->getOpcode() == Instruction::Shl || @@ -905,12 +1142,17 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, else ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, Imm, SetFlags, WantResult); - } + } else if (const auto *C = dyn_cast(RHS)) + if (C->isNullValue()) + ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags, + WantResult); + if (ResultReg) return ResultReg; // Only extend the RHS within the instruction if there is a valid extend type. - if (ExtendType != AArch64_AM::InvalidShiftExtend && isValueAvailable(RHS)) { + if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() && + isValueAvailable(RHS)) { if (const auto *SI = dyn_cast(RHS)) if (const auto *C = dyn_cast(SI->getOperand(1))) if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) { @@ -930,8 +1172,28 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, ExtendType, 0, SetFlags, WantResult); } + // Check if the mul can be folded into the instruction. + if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (isMulPowOf2(RHS)) { + const Value *MulLHS = cast(RHS)->getOperand(0); + const Value *MulRHS = cast(RHS)->getOperand(1); + + if (const auto *C = dyn_cast(MulLHS)) + if (C->getValue().isPowerOf2()) + std::swap(MulLHS, MulRHS); + + assert(isa(MulRHS) && "Expected a ConstantInt."); + uint64_t ShiftVal = cast(MulRHS)->getValue().logBase2(); + unsigned RHSReg = getRegForValue(MulLHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(MulLHS); + return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, + AArch64_AM::LSL, ShiftVal, SetFlags, WantResult); + } + // Check if the shift can be folded into the instruction. - if (isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) if (const auto *SI = dyn_cast(RHS)) { if (const auto *C = dyn_cast(SI->getOperand(1))) { AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend; @@ -1197,6 +1459,30 @@ unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, IsZExt); } +/// \brief This method is a wrapper to simplify add emission. +/// +/// First try to emit an add with an immediate operand using emitAddSub_ri. If +/// that fails, then try to materialize the immediate into a register and use +/// emitAddSub_rr instead. +unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, + int64_t Imm) { + unsigned ResultReg; + if (Imm < 0) + ResultReg = emitAddSub_ri(false, VT, Op0, Op0IsKill, -Imm); + else + ResultReg = emitAddSub_ri(true, VT, Op0, Op0IsKill, Imm); + + if (ResultReg) + return ResultReg; + + unsigned CReg = fastEmit_i(VT, VT, ISD::Constant, Imm); + if (!CReg) + return 0; + + ResultReg = emitAddSub_rr(true, VT, Op0, Op0IsKill, CReg, true); + return ResultReg; +} + unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags, bool WantResult, bool IsZExt) { return emitAddSub(/*UseAdd=*/false, RetVT, LHS, RHS, SetFlags, WantResult, @@ -1226,12 +1512,16 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (isa(LHS) && !isa(RHS)) std::swap(LHS, RHS); + // Canonicalize mul by power-of-2 to the RHS. + if (LHS->hasOneUse() && isValueAvailable(LHS)) + if (isMulPowOf2(LHS)) + std::swap(LHS, RHS); + // Canonicalize shift immediate to the RHS. - if (isValueAvailable(LHS)) - if (const auto *SI = dyn_cast(LHS)) + if (LHS->hasOneUse() && isValueAvailable(LHS)) + if (const auto *SI = dyn_cast(LHS)) if (isa(SI->getOperand(1))) - if (SI->getOpcode() == Instruction::Shl) - std::swap(LHS, RHS); + std::swap(LHS, RHS); unsigned LHSReg = getRegForValue(LHS); if (!LHSReg) @@ -1246,19 +1536,39 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (ResultReg) return ResultReg; + // Check if the mul can be folded into the instruction. + if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (isMulPowOf2(RHS)) { + const Value *MulLHS = cast(RHS)->getOperand(0); + const Value *MulRHS = cast(RHS)->getOperand(1); + + if (const auto *C = dyn_cast(MulLHS)) + if (C->getValue().isPowerOf2()) + std::swap(MulLHS, MulRHS); + + assert(isa(MulRHS) && "Expected a ConstantInt."); + uint64_t ShiftVal = cast(MulRHS)->getValue().logBase2(); + + unsigned RHSReg = getRegForValue(MulLHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(MulLHS); + return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + } + // Check if the shift can be folded into the instruction. - if (isValueAvailable(RHS)) - if (const auto *SI = dyn_cast(RHS)) - if (const auto *C = dyn_cast(SI->getOperand(1))) - if (SI->getOpcode() == Instruction::Shl) { - uint64_t ShiftVal = C->getZExtValue(); - unsigned RHSReg = getRegForValue(SI->getOperand(0)); - if (!RHSReg) - return 0; - bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); - } + if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (const auto *SI = dyn_cast(RHS)) + if (const auto *C = dyn_cast(SI->getOperand(1))) { + uint64_t ShiftVal = C->getZExtValue(); + unsigned RHSReg = getRegForValue(SI->getOperand(0)); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); + return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) @@ -1363,23 +1673,15 @@ unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm); } -bool AArch64FastISel::emitLoad(MVT VT, unsigned &ResultReg, Address Addr, - MachineMemOperand *MMO) { +unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr, + bool WantZExt, MachineMemOperand *MMO) { // Simplify this down to something we can handle. if (!simplifyAddress(Addr, VT)) - return false; + return 0; - unsigned ScaleFactor; - switch (VT.SimpleTy) { - default: llvm_unreachable("Unexpected value type."); - case MVT::i1: // fall-through - case MVT::i8: ScaleFactor = 1; break; - case MVT::i16: ScaleFactor = 2; break; - case MVT::i32: // fall-through - case MVT::f32: ScaleFactor = 4; break; - case MVT::i64: // fall-through - case MVT::f64: ScaleFactor = 8; break; - } + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + llvm_unreachable("Unexpected value type."); // Negative offsets require unscaled, 9-bit, signed immediate offsets. // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. @@ -1389,20 +1691,54 @@ bool AArch64FastISel::emitLoad(MVT VT, unsigned &ResultReg, Address Addr, ScaleFactor = 1; } - static const unsigned OpcTable[4][6] = { - { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, AArch64::LDURXi, - AArch64::LDURSi, AArch64::LDURDi }, - { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, AArch64::LDRXui, - AArch64::LDRSui, AArch64::LDRDui }, - { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, AArch64::LDRXroX, - AArch64::LDRSroX, AArch64::LDRDroX }, - { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, AArch64::LDRXroW, - AArch64::LDRSroW, AArch64::LDRDroW } + static const unsigned GPOpcTable[2][8][4] = { + // Sign-extend. + { { AArch64::LDURSBWi, AArch64::LDURSHWi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDURSBXi, AArch64::LDURSHXi, AArch64::LDURSWi, + AArch64::LDURXi }, + { AArch64::LDRSBWui, AArch64::LDRSHWui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRSBXui, AArch64::LDRSHXui, AArch64::LDRSWui, + AArch64::LDRXui }, + { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX, + AArch64::LDRXroX }, + { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW, + AArch64::LDRXroW }, + { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW, + AArch64::LDRXroW } + }, + // Zero-extend. + { { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, + AArch64::LDRXroW }, + { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, + AArch64::LDRXroW } + } + }; + + static const unsigned FPOpcTable[4][2] = { + { AArch64::LDURSi, AArch64::LDURDi }, + { AArch64::LDRSui, AArch64::LDRDui }, + { AArch64::LDRSroX, AArch64::LDRDroX }, + { AArch64::LDRSroW, AArch64::LDRDroW } }; unsigned Opc; const TargetRegisterClass *RC; - bool VTIsi1 = false; bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() && Addr.getOffsetReg(); unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0; @@ -1410,30 +1746,65 @@ bool AArch64FastISel::emitLoad(MVT VT, unsigned &ResultReg, Address Addr, Addr.getExtendType() == AArch64_AM::SXTW) Idx++; + bool IsRet64Bit = RetVT == MVT::i64; switch (VT.SimpleTy) { - default: llvm_unreachable("Unexpected value type."); - case MVT::i1: VTIsi1 = true; // Intentional fall-through. - case MVT::i8: Opc = OpcTable[Idx][0]; RC = &AArch64::GPR32RegClass; break; - case MVT::i16: Opc = OpcTable[Idx][1]; RC = &AArch64::GPR32RegClass; break; - case MVT::i32: Opc = OpcTable[Idx][2]; RC = &AArch64::GPR32RegClass; break; - case MVT::i64: Opc = OpcTable[Idx][3]; RC = &AArch64::GPR64RegClass; break; - case MVT::f32: Opc = OpcTable[Idx][4]; RC = &AArch64::FPR32RegClass; break; - case MVT::f64: Opc = OpcTable[Idx][5]; RC = &AArch64::FPR64RegClass; break; + default: + llvm_unreachable("Unexpected value type."); + case MVT::i1: // Intentional fall-through. + case MVT::i8: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; + break; + case MVT::i16: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; + break; + case MVT::i32: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; + break; + case MVT::i64: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3]; + RC = &AArch64::GPR64RegClass; + break; + case MVT::f32: + Opc = FPOpcTable[Idx][0]; + RC = &AArch64::FPR32RegClass; + break; + case MVT::f64: + Opc = FPOpcTable[Idx][1]; + RC = &AArch64::FPR64RegClass; + break; } // Create the base instruction, then add the operands. - ResultReg = createResultReg(RC); + unsigned ResultReg = createResultReg(RC); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO); // Loading an i1 requires special handling. - if (VTIsi1) { + if (VT == MVT::i1) { unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1); assert(ANDReg && "Unexpected AND instruction emission failure."); ResultReg = ANDReg; } - return true; + + // For zero-extending loads to 64bit we emit a 32bit load and then convert + // the 32bit reg to a 64bit reg. + if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) { + unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(ResultReg, getKillRegState(true)) + .addImm(AArch64::sub_32); + ResultReg = Reg64; + } + return ResultReg; } bool AArch64FastISel::selectAddSub(const Instruction *I) { @@ -1505,10 +1876,83 @@ bool AArch64FastISel::selectLoad(const Instruction *I) { if (!computeAddress(I->getOperand(0), Addr, I->getType())) return false; - unsigned ResultReg; - if (!emitLoad(VT, ResultReg, Addr, createMachineMemOperandFor(I))) + // Fold the following sign-/zero-extend into the load instruction. + bool WantZExt = true; + MVT RetVT = VT; + const Value *IntExtVal = nullptr; + if (I->hasOneUse()) { + if (const auto *ZE = dyn_cast(I->use_begin()->getUser())) { + if (isTypeSupported(ZE->getType(), RetVT)) + IntExtVal = ZE; + else + RetVT = VT; + } else if (const auto *SE = dyn_cast(I->use_begin()->getUser())) { + if (isTypeSupported(SE->getType(), RetVT)) + IntExtVal = SE; + else + RetVT = VT; + WantZExt = false; + } + } + + unsigned ResultReg = + emitLoad(VT, RetVT, Addr, WantZExt, createMachineMemOperandFor(I)); + if (!ResultReg) return false; + // There are a few different cases we have to handle, because the load or the + // sign-/zero-extend might not be selected by FastISel if we fall-back to + // SelectionDAG. There is also an ordering issue when both instructions are in + // different basic blocks. + // 1.) The load instruction is selected by FastISel, but the integer extend + // not. This usually happens when the integer extend is in a different + // basic block and SelectionDAG took over for that basic block. + // 2.) The load instruction is selected before the integer extend. This only + // happens when the integer extend is in a different basic block. + // 3.) The load instruction is selected by SelectionDAG and the integer extend + // by FastISel. This happens if there are instructions between the load + // and the integer extend that couldn't be selected by FastISel. + if (IntExtVal) { + // The integer extend hasn't been emitted yet. FastISel or SelectionDAG + // could select it. Emit a copy to subreg if necessary. FastISel will remove + // it when it selects the integer extend. + unsigned Reg = lookUpRegForValue(IntExtVal); + auto *MI = MRI.getUniqueVRegDef(Reg); + if (!MI) { + if (RetVT == MVT::i64 && VT <= MVT::i32) { + if (WantZExt) { + // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG). + std::prev(FuncInfo.InsertPt)->eraseFromParent(); + ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg(); + } else + ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg, + /*IsKill=*/true, + AArch64::sub_32); + } + updateValueMap(I, ResultReg); + return true; + } + + // The integer extend has already been emitted - delete all the instructions + // that have been emitted by the integer extend lowering code and use the + // result from the load instruction directly. + while (MI) { + Reg = 0; + for (auto &Opnd : MI->uses()) { + if (Opnd.isReg()) { + Reg = Opnd.getReg(); + break; + } + } + MI->eraseFromParent(); + MI = nullptr; + if (Reg) + MI = MRI.getUniqueVRegDef(Reg); + } + updateValueMap(IntExtVal, ResultReg); + return true; + } + updateValueMap(I, ResultReg); return true; } @@ -1519,17 +1963,9 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, if (!simplifyAddress(Addr, VT)) return false; - unsigned ScaleFactor; - switch (VT.SimpleTy) { - default: llvm_unreachable("Unexpected value type."); - case MVT::i1: // fall-through - case MVT::i8: ScaleFactor = 1; break; - case MVT::i16: ScaleFactor = 2; break; - case MVT::i32: // fall-through - case MVT::f32: ScaleFactor = 4; break; - case MVT::i64: // fall-through - case MVT::f64: ScaleFactor = 8; break; - } + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + llvm_unreachable("Unexpected value type."); // Negative offsets require unscaled, 9-bit, signed immediate offsets. // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. @@ -1539,7 +1975,6 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, ScaleFactor = 1; } - static const unsigned OpcTable[4][6] = { { AArch64::STURBBi, AArch64::STURHHi, AArch64::STURWi, AArch64::STURXi, AArch64::STURSi, AArch64::STURDi }, @@ -1549,7 +1984,6 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, AArch64::STRSroX, AArch64::STRDroX }, { AArch64::STRBBroW, AArch64::STRHHroW, AArch64::STRWroW, AArch64::STRXroW, AArch64::STRSroW, AArch64::STRDroW } - }; unsigned Opc; @@ -1673,6 +2107,134 @@ static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { } } +/// \brief Try to emit a combined compare-and-branch instruction. +bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { + assert(isa(BI->getCondition()) && "Expected cmp instruction"); + const CmpInst *CI = cast(BI->getCondition()); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + MVT VT; + if (!isTypeSupported(LHS->getType(), VT)) + return false; + + unsigned BW = VT.getSizeInBits(); + if (BW > 64) + return false; + + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Try to take advantage of fallthrough opportunities. + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + int TestBit = -1; + bool IsCmpNE; + switch (Predicate) { + default: + return false; + case CmpInst::ICMP_EQ: + case CmpInst::ICMP_NE: + if (isa(LHS) && cast(LHS)->isNullValue()) + std::swap(LHS, RHS); + + if (!isa(RHS) || !cast(RHS)->isNullValue()) + return false; + + if (const auto *AI = dyn_cast(LHS)) + if (AI->getOpcode() == Instruction::And && isValueAvailable(AI)) { + const Value *AndLHS = AI->getOperand(0); + const Value *AndRHS = AI->getOperand(1); + + if (const auto *C = dyn_cast(AndLHS)) + if (C->getValue().isPowerOf2()) + std::swap(AndLHS, AndRHS); + + if (const auto *C = dyn_cast(AndRHS)) + if (C->getValue().isPowerOf2()) { + TestBit = C->getValue().logBase2(); + LHS = AndLHS; + } + } + + if (VT == MVT::i1) + TestBit = 0; + + IsCmpNE = Predicate == CmpInst::ICMP_NE; + break; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SGE: + if (!isa(RHS) || !cast(RHS)->isNullValue()) + return false; + + TestBit = BW - 1; + IsCmpNE = Predicate == CmpInst::ICMP_SLT; + break; + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SLE: + if (!isa(RHS)) + return false; + + if (cast(RHS)->getValue() != APInt(BW, -1, true)) + return false; + + TestBit = BW - 1; + IsCmpNE = Predicate == CmpInst::ICMP_SLE; + break; + } // end switch + + static const unsigned OpcTable[2][2][2] = { + { {AArch64::CBZW, AArch64::CBZX }, + {AArch64::CBNZW, AArch64::CBNZX} }, + { {AArch64::TBZW, AArch64::TBZX }, + {AArch64::TBNZW, AArch64::TBNZX} } + }; + + bool IsBitTest = TestBit != -1; + bool Is64Bit = BW == 64; + if (TestBit < 32 && TestBit >= 0) + Is64Bit = false; + + unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit]; + const MCInstrDesc &II = TII.get(Opc); + + unsigned SrcReg = getRegForValue(LHS); + if (!SrcReg) + return false; + bool SrcIsKill = hasTrivialKill(LHS); + + if (BW == 64 && !Is64Bit) + SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill, + AArch64::sub_32); + + if ((BW < 32) && !IsBitTest) + SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true); + + // Emit the combined compare and branch instruction. + SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs()); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) + .addReg(SrcReg, getKillRegState(SrcIsKill)); + if (IsBitTest) + MIB.addImm(TestBit); + MIB.addMBB(TBB); + + // Obtain the branch weight and add the TrueBB to the successor list. + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + fastEmitBranch(FBB, DbgLoc); + + return true; +} + bool AArch64FastISel::selectBranch(const Instruction *I) { const BranchInst *BI = cast(I); if (BI->isUnconditional()) { @@ -1700,6 +2262,10 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { return true; } + // Try to emit a combined compare-and-branch first. + if (emitCompareAndBranch(BI)) + return true; + // Try to take advantage of fallthrough opportunities. if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); @@ -1956,60 +2522,186 @@ bool AArch64FastISel::selectCmp(const Instruction *I) { return true; } -bool AArch64FastISel::selectSelect(const Instruction *I) { - const SelectInst *SI = cast(I); +/// \brief Optimize selects of i1 if one of the operands has a 'true' or 'false' +/// value. +bool AArch64FastISel::optimizeSelect(const SelectInst *SI) { + if (!SI->getType()->isIntegerTy(1)) + return false; - EVT DestEVT = TLI.getValueType(SI->getType(), true); - if (!DestEVT.isSimple()) + const Value *Src1Val, *Src2Val; + unsigned Opc = 0; + bool NeedExtraOp = false; + if (auto *CI = dyn_cast(SI->getTrueValue())) { + if (CI->isOne()) { + Src1Val = SI->getCondition(); + Src2Val = SI->getFalseValue(); + Opc = AArch64::ORRWrr; + } else { + assert(CI->isZero()); + Src1Val = SI->getFalseValue(); + Src2Val = SI->getCondition(); + Opc = AArch64::BICWrr; + } + } else if (auto *CI = dyn_cast(SI->getFalseValue())) { + if (CI->isOne()) { + Src1Val = SI->getCondition(); + Src2Val = SI->getTrueValue(); + Opc = AArch64::ORRWrr; + NeedExtraOp = true; + } else { + assert(CI->isZero()); + Src1Val = SI->getCondition(); + Src2Val = SI->getTrueValue(); + Opc = AArch64::ANDWrr; + } + } + + if (!Opc) return false; - MVT DestVT = DestEVT.getSimpleVT(); - if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 && - DestVT != MVT::f64) + unsigned Src1Reg = getRegForValue(Src1Val); + if (!Src1Reg) return false; + bool Src1IsKill = hasTrivialKill(Src1Val); - unsigned SelectOpc; - const TargetRegisterClass *RC = nullptr; - switch (DestVT.SimpleTy) { - default: return false; + unsigned Src2Reg = getRegForValue(Src2Val); + if (!Src2Reg) + return false; + bool Src2IsKill = hasTrivialKill(Src2Val); + + if (NeedExtraOp) { + Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, Src1IsKill, 1); + Src1IsKill = true; + } + unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32spRegClass, Src1Reg, + Src1IsKill, Src2Reg, Src2IsKill); + updateValueMap(SI, ResultReg); + return true; +} + +bool AArch64FastISel::selectSelect(const Instruction *I) { + assert(isa(I) && "Expected a select instruction."); + MVT VT; + if (!isTypeSupported(I->getType(), VT)) + return false; + + unsigned Opc; + const TargetRegisterClass *RC; + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: case MVT::i32: - SelectOpc = AArch64::CSELWr; RC = &AArch64::GPR32RegClass; break; + Opc = AArch64::CSELWr; + RC = &AArch64::GPR32RegClass; + break; case MVT::i64: - SelectOpc = AArch64::CSELXr; RC = &AArch64::GPR64RegClass; break; + Opc = AArch64::CSELXr; + RC = &AArch64::GPR64RegClass; + break; case MVT::f32: - SelectOpc = AArch64::FCSELSrrr; RC = &AArch64::FPR32RegClass; break; + Opc = AArch64::FCSELSrrr; + RC = &AArch64::FPR32RegClass; + break; case MVT::f64: - SelectOpc = AArch64::FCSELDrrr; RC = &AArch64::FPR64RegClass; break; + Opc = AArch64::FCSELDrrr; + RC = &AArch64::FPR64RegClass; + break; } + const SelectInst *SI = cast(I); const Value *Cond = SI->getCondition(); - bool NeedTest = true; AArch64CC::CondCode CC = AArch64CC::NE; - if (foldXALUIntrinsic(CC, I, Cond)) - NeedTest = false; + AArch64CC::CondCode ExtraCC = AArch64CC::AL; - unsigned CondReg = getRegForValue(Cond); - if (!CondReg) - return false; - bool CondIsKill = hasTrivialKill(Cond); + if (optimizeSelect(SI)) + return true; - if (NeedTest) { - unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1); - assert(ANDReg && "Unexpected AND instruction emission failure."); - emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0); + // Try to pickup the flags, so we don't have to emit another compare. + if (foldXALUIntrinsic(CC, I, Cond)) { + // Fake request the condition to force emission of the XALU intrinsic. + unsigned CondReg = getRegForValue(Cond); + if (!CondReg) + return false; + } else if (isa(Cond) && cast(Cond)->hasOneUse() && + isValueAvailable(Cond)) { + const auto *Cmp = cast(Cond); + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(Cmp); + const Value *FoldSelect = nullptr; + switch (Predicate) { + default: + break; + case CmpInst::FCMP_FALSE: + FoldSelect = SI->getFalseValue(); + break; + case CmpInst::FCMP_TRUE: + FoldSelect = SI->getTrueValue(); + break; + } + + if (FoldSelect) { + unsigned SrcReg = getRegForValue(FoldSelect); + if (!SrcReg) + return false; + unsigned UseReg = lookUpRegForValue(SI); + if (UseReg) + MRI.clearKillFlags(UseReg); + + updateValueMap(I, SrcReg); + return true; + } + + // Emit the cmp. + if (!emitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned())) + return false; + + // FCMP_UEQ and FCMP_ONE cannot be checked with a single select instruction. + CC = getCompareCC(Predicate); + switch (Predicate) { + default: + break; + case CmpInst::FCMP_UEQ: + ExtraCC = AArch64CC::EQ; + CC = AArch64CC::VS; + break; + case CmpInst::FCMP_ONE: + ExtraCC = AArch64CC::MI; + CC = AArch64CC::GT; + break; + } + assert((CC != AArch64CC::AL) && "Unexpected condition code."); + } else { + unsigned CondReg = getRegForValue(Cond); + if (!CondReg) + return false; + bool CondIsKill = hasTrivialKill(Cond); + + // Emit a TST instruction (ANDS wzr, reg, #imm). + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDSWri), + AArch64::WZR) + .addReg(CondReg, getKillRegState(CondIsKill)) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); } - unsigned TrueReg = getRegForValue(SI->getTrueValue()); - bool TrueIsKill = hasTrivialKill(SI->getTrueValue()); + unsigned Src1Reg = getRegForValue(SI->getTrueValue()); + bool Src1IsKill = hasTrivialKill(SI->getTrueValue()); - unsigned FalseReg = getRegForValue(SI->getFalseValue()); - bool FalseIsKill = hasTrivialKill(SI->getFalseValue()); + unsigned Src2Reg = getRegForValue(SI->getFalseValue()); + bool Src2IsKill = hasTrivialKill(SI->getFalseValue()); - if (!TrueReg || !FalseReg) + if (!Src1Reg || !Src2Reg) return false; - unsigned ResultReg = fastEmitInst_rri(SelectOpc, RC, TrueReg, TrueIsKill, - FalseReg, FalseIsKill, CC); + if (ExtraCC != AArch64CC::AL) { + Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, + Src2IsKill, ExtraCC); + Src2IsKill = true; + } + unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, + Src2IsKill, CC); updateValueMap(I, ResultReg); return true; } @@ -2343,6 +3035,11 @@ bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, // Copy all of the result registers out of their specified physreg. MVT CopyVT = RVLocs[0].getValVT(); + + // TODO: Handle big-endian results + if (CopyVT.isVector() && !Subtarget->isLittleEndian()) + return false; + unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) @@ -2467,7 +3164,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CC)); + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); CLI.Call = MIB; @@ -2515,14 +3212,11 @@ bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src, } } - bool RV; - unsigned ResultReg; - RV = emitLoad(VT, ResultReg, Src); - if (!RV) + unsigned ResultReg = emitLoad(VT, VT, Src); + if (!ResultReg) return false; - RV = emitStore(VT, ResultReg, Dest); - if (!RV) + if (!emitStore(VT, ResultReg, Dest)) return false; int64_t Size = VT.getSizeInBits() / 8; @@ -2560,15 +3254,49 @@ bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC, if (RetVT != MVT::i32 && RetVT != MVT::i64) return false; + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); + + // Canonicalize immediate to the RHS. + if (isa(LHS) && !isa(RHS) && + isCommutativeIntrinsic(II)) + std::swap(LHS, RHS); + + // Simplify multiplies. + unsigned IID = II->getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::smul_with_overflow: + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 2) + IID = Intrinsic::sadd_with_overflow; + break; + case Intrinsic::umul_with_overflow: + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 2) + IID = Intrinsic::uadd_with_overflow; + break; + } + AArch64CC::CondCode TmpCC; - switch (II->getIntrinsicID()) { - default: return false; - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: TmpCC = AArch64CC::VS; break; - case Intrinsic::uadd_with_overflow: TmpCC = AArch64CC::HS; break; - case Intrinsic::usub_with_overflow: TmpCC = AArch64CC::LO; break; - case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: TmpCC = AArch64CC::NE; break; + switch (IID) { + default: + return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + TmpCC = AArch64CC::VS; + break; + case Intrinsic::uadd_with_overflow: + TmpCC = AArch64CC::HS; + break; + case Intrinsic::usub_with_overflow: + TmpCC = AArch64CC::LO; + break; + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + TmpCC = AArch64CC::NE; + break; } // Check if both instructions are in the same basic block. @@ -2603,8 +3331,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { MFI->setFrameAddressIsTaken(true); const AArch64RegisterInfo *RegInfo = - static_cast( - TM.getSubtargetImpl()->getRegisterInfo()); + static_cast(Subtarget->getRegisterInfo()); unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -2728,6 +3455,32 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { updateValueMap(II, CLI.ResultReg); return true; } + case Intrinsic::fabs: { + MVT VT; + if (!isTypeLegal(II->getType(), VT)) + return false; + + unsigned Opc; + switch (VT.SimpleTy) { + default: + return false; + case MVT::f32: + Opc = AArch64::FABSSr; + break; + case MVT::f64: + Opc = AArch64::FABSDr; + break; + } + unsigned SrcReg = getRegForValue(II->getOperand(0)); + if (!SrcReg) + return false; + bool SrcRegIsKill = hasTrivialKill(II->getOperand(0)); + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg, getKillRegState(SrcRegIsKill)); + updateValueMap(II, ResultReg); + return true; + } case Intrinsic::trap: { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) .addImm(1); @@ -2777,9 +3530,30 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { isCommutativeIntrinsic(II)) std::swap(LHS, RHS); + // Simplify multiplies. + unsigned IID = II->getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::smul_with_overflow: + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 2) { + IID = Intrinsic::sadd_with_overflow; + RHS = LHS; + } + break; + case Intrinsic::umul_with_overflow: + if (const auto *C = dyn_cast(RHS)) + if (C->getValue() == 2) { + IID = Intrinsic::uadd_with_overflow; + RHS = LHS; + } + break; + } + unsigned ResultReg1 = 0, ResultReg2 = 0, MulReg = 0; AArch64CC::CondCode CC = AArch64CC::Invalid; - switch (II->getIntrinsicID()) { + switch (IID) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::sadd_with_overflow: ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true); @@ -2869,6 +3643,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass, AArch64::WZR, /*IsKill=*/true, AArch64::WZR, /*IsKill=*/true, getInvertedCondCode(CC)); + (void)ResultReg2; assert((ResultReg1 + 1) == ResultReg2 && "Nonconsecutive result registers."); updateValueMap(II, ResultReg1, 2); @@ -3139,11 +3914,12 @@ unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, bool Op0IsKill, uint64_t Shift, - bool IsZext) { + bool IsZExt) { assert(RetVT.SimpleTy >= SrcVT.SimpleTy && "Unexpected source/return type pair."); - assert((SrcVT == MVT::i8 || SrcVT == MVT::i16 || SrcVT == MVT::i32 || - SrcVT == MVT::i64) && "Unexpected source value type."); + assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || + SrcVT == MVT::i32 || SrcVT == MVT::i64) && + "Unexpected source value type."); assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 || RetVT == MVT::i64) && "Unexpected return value type."); @@ -3151,6 +3927,20 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, unsigned RegSize = Is64Bit ? 64 : 32; unsigned DstBits = RetVT.getSizeInBits(); unsigned SrcBits = SrcVT.getSizeInBits(); + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + // Just emit a copy for "zero" shifts. + if (Shift == 0) { + if (RetVT == SrcVT) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill)); + return ResultReg; + } else + return emitIntExt(SrcVT, Op0, RetVT, IsZExt); + } // Don't deal with undefined shifts. if (Shift >= DstBits) @@ -3188,9 +3978,7 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, {AArch64::SBFMWri, AArch64::SBFMXri}, {AArch64::UBFMWri, AArch64::UBFMXri} }; - unsigned Opc = OpcTable[IsZext][Is64Bit]; - const TargetRegisterClass *RC = - Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + unsigned Opc = OpcTable[IsZExt][Is64Bit]; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { unsigned TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -3236,8 +4024,9 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, bool IsZExt) { assert(RetVT.SimpleTy >= SrcVT.SimpleTy && "Unexpected source/return type pair."); - assert((SrcVT == MVT::i8 || SrcVT == MVT::i16 || SrcVT == MVT::i32 || - SrcVT == MVT::i64) && "Unexpected source value type."); + assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || + SrcVT == MVT::i32 || SrcVT == MVT::i64) && + "Unexpected source value type."); assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 || RetVT == MVT::i64) && "Unexpected return value type."); @@ -3245,6 +4034,20 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, unsigned RegSize = Is64Bit ? 64 : 32; unsigned DstBits = RetVT.getSizeInBits(); unsigned SrcBits = SrcVT.getSizeInBits(); + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + // Just emit a copy for "zero" shifts. + if (Shift == 0) { + if (RetVT == SrcVT) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill)); + return ResultReg; + } else + return emitIntExt(SrcVT, Op0, RetVT, IsZExt); + } // Don't deal with undefined shifts. if (Shift >= DstBits) @@ -3297,8 +4100,6 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, {AArch64::UBFMWri, AArch64::UBFMXri} }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; - const TargetRegisterClass *RC = - Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { unsigned TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -3344,8 +4145,9 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, bool IsZExt) { assert(RetVT.SimpleTy >= SrcVT.SimpleTy && "Unexpected source/return type pair."); - assert((SrcVT == MVT::i8 || SrcVT == MVT::i16 || SrcVT == MVT::i32 || - SrcVT == MVT::i64) && "Unexpected source value type."); + assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || + SrcVT == MVT::i32 || SrcVT == MVT::i64) && + "Unexpected source value type."); assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 || RetVT == MVT::i64) && "Unexpected return value type."); @@ -3353,6 +4155,20 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, unsigned RegSize = Is64Bit ? 64 : 32; unsigned DstBits = RetVT.getSizeInBits(); unsigned SrcBits = SrcVT.getSizeInBits(); + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + // Just emit a copy for "zero" shifts. + if (Shift == 0) { + if (RetVT == SrcVT) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill)); + return ResultReg; + } else + return emitIntExt(SrcVT, Op0, RetVT, IsZExt); + } // Don't deal with undefined shifts. if (Shift >= DstBits) @@ -3393,8 +4209,6 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, {AArch64::UBFMWri, AArch64::UBFMXri} }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; - const TargetRegisterClass *RC = - Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { unsigned TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -3469,48 +4283,154 @@ unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm); } -bool AArch64FastISel::selectIntExt(const Instruction *I) { - // On ARM, in general, integer casts don't involve legal types; this code - // handles promotable integers. The high bits for a type smaller than - // the register size are assumed to be undefined. - Type *DestTy = I->getType(); - Value *Src = I->getOperand(0); - Type *SrcTy = Src->getType(); +static bool isZExtLoad(const MachineInstr *LI) { + switch (LI->getOpcode()) { + default: + return false; + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURWi: + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRWui: + case AArch64::LDRBBroX: + case AArch64::LDRHHroX: + case AArch64::LDRWroX: + case AArch64::LDRBBroW: + case AArch64::LDRHHroW: + case AArch64::LDRWroW: + return true; + } +} - unsigned SrcReg = getRegForValue(Src); - if (!SrcReg) +static bool isSExtLoad(const MachineInstr *LI) { + switch (LI->getOpcode()) { + default: return false; + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + case AArch64::LDURSBXi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::LDRSBWui: + case AArch64::LDRSHWui: + case AArch64::LDRSBXui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::LDRSBWroX: + case AArch64::LDRSHWroX: + case AArch64::LDRSBXroX: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroX: + case AArch64::LDRSBWroW: + case AArch64::LDRSHWroW: + case AArch64::LDRSBXroW: + case AArch64::LDRSHXroW: + case AArch64::LDRSWroW: + return true; + } +} - EVT SrcEVT = TLI.getValueType(SrcTy, true); - EVT DestEVT = TLI.getValueType(DestTy, true); - if (!SrcEVT.isSimple()) +bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT, + MVT SrcVT) { + const auto *LI = dyn_cast(I->getOperand(0)); + if (!LI || !LI->hasOneUse()) return false; - if (!DestEVT.isSimple()) + + // Check if the load instruction has already been selected. + unsigned Reg = lookUpRegForValue(LI); + if (!Reg) return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - MVT DestVT = DestEVT.getSimpleVT(); - unsigned ResultReg = 0; + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + if (!MI) + return false; + + // Check if the correct load instruction has been emitted - SelectionDAG might + // have emitted a zero-extending load, but we need a sign-extending load. + bool IsZExt = isa(I); + const auto *LoadMI = MI; + if (LoadMI->getOpcode() == TargetOpcode::COPY && + LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) { + unsigned LoadReg = MI->getOperand(1).getReg(); + LoadMI = MRI.getUniqueVRegDef(LoadReg); + assert(LoadMI && "Expected valid instruction"); + } + if (!(IsZExt && isZExtLoad(LoadMI)) && !(!IsZExt && isSExtLoad(LoadMI))) + return false; + + // Nothing to be done. + if (RetVT != MVT::i64 || SrcVT > MVT::i32) { + updateValueMap(I, Reg); + return true; + } + + if (IsZExt) { + unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(Reg, getKillRegState(true)) + .addImm(AArch64::sub_32); + Reg = Reg64; + } else { + assert((MI->getOpcode() == TargetOpcode::COPY && + MI->getOperand(1).getSubReg() == AArch64::sub_32) && + "Expected copy instruction"); + Reg = MI->getOperand(1).getReg(); + MI->eraseFromParent(); + } + updateValueMap(I, Reg); + return true; +} + +bool AArch64FastISel::selectIntExt(const Instruction *I) { + assert((isa(I) || isa(I)) && + "Unexpected integer extend instruction."); + MVT RetVT; + MVT SrcVT; + if (!isTypeSupported(I->getType(), RetVT)) + return false; + + if (!isTypeSupported(I->getOperand(0)->getType(), SrcVT)) + return false; + + // Try to optimize already sign-/zero-extended values from load instructions. + if (optimizeIntExtLoad(I, RetVT, SrcVT)) + return true; + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (!SrcReg) + return false; + bool SrcIsKill = hasTrivialKill(I->getOperand(0)); + // Try to optimize already sign-/zero-extended values from function arguments. bool IsZExt = isa(I); - // Check if it is an argument and if it is already zero/sign-extended. - if (const auto *Arg = dyn_cast(Src)) { + if (const auto *Arg = dyn_cast(I->getOperand(0))) { if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) { - if (DestVT == MVT::i64) { - ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + if (RetVT == MVT::i64 && SrcVT != MVT::i64) { + unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), ResultReg) - .addImm(0) - .addReg(SrcReg) - .addImm(AArch64::sub_32); - } else - ResultReg = SrcReg; + .addImm(0) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(AArch64::sub_32); + SrcReg = ResultReg; + } + // Conservatively clear all kill flags from all uses, because we are + // replacing a sign-/zero-extend instruction at IR level with a nop at MI + // level. The result of the instruction at IR level might have been + // trivially dead, which is now not longer true. + unsigned UseReg = lookUpRegForValue(I); + if (UseReg) + MRI.clearKillFlags(UseReg); + + updateValueMap(I, SrcReg); + return true; } } - if (!ResultReg) - ResultReg = emitIntExt(SrcVT, SrcReg, DestVT, IsZExt); - + unsigned ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt); if (!ResultReg) return false; @@ -3565,15 +4485,58 @@ bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) { } bool AArch64FastISel::selectMul(const Instruction *I) { - EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true); - if (!SrcEVT.isSimple()) + MVT VT; + if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true)) return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - // Must be simple value type. Don't handle vectors. - if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && - SrcVT != MVT::i8) - return false; + if (VT.isVector()) + return selectBinaryOp(I, ISD::MUL); + + const Value *Src0 = I->getOperand(0); + const Value *Src1 = I->getOperand(1); + if (const auto *C = dyn_cast(Src0)) + if (C->getValue().isPowerOf2()) + std::swap(Src0, Src1); + + // Try to simplify to a shift instruction. + if (const auto *C = dyn_cast(Src1)) + if (C->getValue().isPowerOf2()) { + uint64_t ShiftVal = C->getValue().logBase2(); + MVT SrcVT = VT; + bool IsZExt = true; + if (const auto *ZExt = dyn_cast(Src0)) { + if (!isIntExtFree(ZExt)) { + MVT VT; + if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), VT)) { + SrcVT = VT; + IsZExt = true; + Src0 = ZExt->getOperand(0); + } + } + } else if (const auto *SExt = dyn_cast(Src0)) { + if (!isIntExtFree(SExt)) { + MVT VT; + if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), VT)) { + SrcVT = VT; + IsZExt = false; + Src0 = SExt->getOperand(0); + } + } + } + + unsigned Src0Reg = getRegForValue(Src0); + if (!Src0Reg) + return false; + bool Src0IsKill = hasTrivialKill(Src0); + + unsigned ResultReg = + emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt); + + if (ResultReg) { + updateValueMap(I, ResultReg); + return true; + } + } unsigned Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) @@ -3585,8 +4548,7 @@ bool AArch64FastISel::selectMul(const Instruction *I) { return false; bool Src1IsKill = hasTrivialKill(I->getOperand(1)); - unsigned ResultReg = - emitMul_rr(SrcVT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill); + unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill); if (!ResultReg) return false; @@ -3607,21 +4569,25 @@ bool AArch64FastISel::selectShift(const Instruction *I) { unsigned ResultReg = 0; uint64_t ShiftVal = C->getZExtValue(); MVT SrcVT = RetVT; - bool IsZExt = (I->getOpcode() == Instruction::AShr) ? false : true; + bool IsZExt = I->getOpcode() != Instruction::AShr; const Value *Op0 = I->getOperand(0); if (const auto *ZExt = dyn_cast(Op0)) { - MVT TmpVT; - if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) { - SrcVT = TmpVT; - IsZExt = true; - Op0 = ZExt->getOperand(0); + if (!isIntExtFree(ZExt)) { + MVT TmpVT; + if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) { + SrcVT = TmpVT; + IsZExt = true; + Op0 = ZExt->getOperand(0); + } } } else if (const auto *SExt = dyn_cast(Op0)) { - MVT TmpVT; - if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) { - SrcVT = TmpVT; - IsZExt = false; - Op0 = SExt->getOperand(0); + if (!isIntExtFree(SExt)) { + MVT TmpVT; + if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) { + SrcVT = TmpVT; + IsZExt = false; + Op0 = SExt->getOperand(0); + } } } @@ -3758,6 +4724,169 @@ bool AArch64FastISel::selectFRem(const Instruction *I) { return true; } +bool AArch64FastISel::selectSDiv(const Instruction *I) { + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + if (!isa(I->getOperand(1))) + return selectBinaryOp(I, ISD::SDIV); + + const APInt &C = cast(I->getOperand(1))->getValue(); + if ((VT != MVT::i32 && VT != MVT::i64) || !C || + !(C.isPowerOf2() || (-C).isPowerOf2())) + return selectBinaryOp(I, ISD::SDIV); + + unsigned Lg2 = C.countTrailingZeros(); + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + bool Src0IsKill = hasTrivialKill(I->getOperand(0)); + + if (cast(I)->isExact()) { + unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2); + if (!ResultReg) + return false; + updateValueMap(I, ResultReg); + return true; + } + + int64_t Pow2MinusOne = (1ULL << Lg2) - 1; + unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne); + if (!AddReg) + return false; + + // (Src0 < 0) ? Pow2 - 1 : 0; + if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0)) + return false; + + unsigned SelectOpc; + const TargetRegisterClass *RC; + if (VT == MVT::i64) { + SelectOpc = AArch64::CSELXr; + RC = &AArch64::GPR64RegClass; + } else { + SelectOpc = AArch64::CSELWr; + RC = &AArch64::GPR32RegClass; + } + unsigned SelectReg = + fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg, + Src0IsKill, AArch64CC::LT); + if (!SelectReg) + return false; + + // Divide by Pow2 --> ashr. If we're dividing by a negative value we must also + // negate the result. + unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; + unsigned ResultReg; + if (C.isNegative()) + ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true, + SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2); + else + ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2); + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +/// This is mostly a copy of the existing FastISel getRegForGEPIndex code. We +/// have to duplicate it for AArch64, because otherwise we would fail during the +/// sign-extend emission. +std::pair AArch64FastISel::getRegForGEPIndex(const Value *Idx) { + unsigned IdxN = getRegForValue(Idx); + if (IdxN == 0) + // Unhandled operand. Halt "fast" selection and bail. + return std::pair(0, false); + + bool IdxNIsKill = hasTrivialKill(Idx); + + // If the index is smaller or larger than intptr_t, truncate or extend it. + MVT PtrVT = TLI.getPointerTy(); + EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); + if (IdxVT.bitsLT(PtrVT)) { + IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false); + IdxNIsKill = true; + } else if (IdxVT.bitsGT(PtrVT)) + llvm_unreachable("AArch64 FastISel doesn't support types larger than i64"); + return std::pair(IdxN, IdxNIsKill); +} + +/// This is mostly a copy of the existing FastISel GEP code, but we have to +/// duplicate it for AArch64, because otherwise we would bail out even for +/// simple cases. This is because the standard fastEmit functions don't cover +/// MUL at all and ADD is lowered very inefficientily. +bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { + unsigned N = getRegForValue(I->getOperand(0)); + if (!N) + return false; + bool NIsKill = hasTrivialKill(I->getOperand(0)); + + // Keep a running tab of the total offset to coalesce multiple N = N + Offset + // into a single N = N + TotalOffset. + uint64_t TotalOffs = 0; + Type *Ty = I->getOperand(0)->getType(); + MVT VT = TLI.getPointerTy(); + for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) { + const Value *Idx = *OI; + if (auto *StTy = dyn_cast(Ty)) { + unsigned Field = cast(Idx)->getZExtValue(); + // N = N + Offset + if (Field) + TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field); + Ty = StTy->getElementType(Field); + } else { + Ty = cast(Ty)->getElementType(); + // If this is a constant subscript, handle it quickly. + if (const auto *CI = dyn_cast(Idx)) { + if (CI->isZero()) + continue; + // N = N + Offset + TotalOffs += + DL.getTypeAllocSize(Ty) * cast(CI)->getSExtValue(); + continue; + } + if (TotalOffs) { + N = emitAdd_ri_(VT, N, NIsKill, TotalOffs); + if (!N) + return false; + NIsKill = true; + TotalOffs = 0; + } + + // N = N + Idx * ElementSize; + uint64_t ElementSize = DL.getTypeAllocSize(Ty); + std::pair Pair = getRegForGEPIndex(Idx); + unsigned IdxN = Pair.first; + bool IdxNIsKill = Pair.second; + if (!IdxN) + return false; + + if (ElementSize != 1) { + unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize); + if (!C) + return false; + IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true); + if (!IdxN) + return false; + IdxNIsKill = true; + } + N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill); + if (!N) + return false; + } + } + if (TotalOffs) { + N = emitAdd_ri_(VT, N, NIsKill, TotalOffs); + if (!N) + return false; + } + updateValueMap(I, N); + return true; +} + bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { default: @@ -3766,9 +4895,9 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { case Instruction::Sub: return selectAddSub(I); case Instruction::Mul: - if (!selectBinaryOp(I, ISD::MUL)) - return selectMul(I); - return true; + return selectMul(I); + case Instruction::SDiv: + return selectSDiv(I); case Instruction::SRem: if (!selectBinaryOp(I, ISD::SREM)) return selectRem(I, ISD::SREM); @@ -3800,13 +4929,8 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { case Instruction::FPToUI: return selectFPToInt(I, /*Signed=*/false); case Instruction::ZExt: - if (!selectCast(I, ISD::ZERO_EXTEND)) - return selectIntExt(I); - return true; case Instruction::SExt: - if (!selectCast(I, ISD::SIGN_EXTEND)) - return selectIntExt(I); - return true; + return selectIntExt(I); case Instruction::Trunc: if (!selectCast(I, ISD::TRUNCATE)) return selectTrunc(I); @@ -3834,6 +4958,8 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { return selectRet(I); case Instruction::FRem: return selectFRem(I); + case Instruction::GetElementPtr: + return selectGetElementPtr(I); } // fall-back to target-independent instruction selection.