diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5f29ba60c19..e982a9360ba 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -57,37 +57,24 @@ STATISTIC(NumTailCalls, "Number of tail calls");
 static cl::opt<bool>
 DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
 
-// Disable16Bit - 16-bit operations typically have a larger encoding than
-// corresponding 32-bit instructions, and 16-bit code is slow on some
-// processors. This is an experimental flag to disable 16-bit operations
-// (which forces them to be Legalized to 32-bit operations).
-static cl::opt<bool>
-Disable16Bit("disable-16bit", cl::Hidden,
-             cl::desc("Disable use of 16-bit instructions"));
-static cl::opt<bool>
-Promote16Bit("promote-16bit", cl::Hidden,
-             cl::desc("Promote 16-bit instructions"));
-
 // Forward declarations.
 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
 
 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
-  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
-  default: llvm_unreachable("unknown subtarget type");
-  case X86Subtarget::isDarwin:
-    if (TM.getSubtarget<X86Subtarget>().is64Bit())
-      return new X8664_MachoTargetObjectFile();
+
+  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
+    if (is64Bit) return new X8664_MachoTargetObjectFile();
     return new TargetLoweringObjectFileMachO();
-  case X86Subtarget::isELF:
-    if (TM.getSubtarget<X86Subtarget>().is64Bit())
-      return new X8664_ELFTargetObjectFile(TM);
+  } else if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
+    if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
     return new X8632_ELFTargetObjectFile(TM);
-  case X86Subtarget::isMingw:
-  case X86Subtarget::isCygwin:
-  case X86Subtarget::isWindows:
+  } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
     return new TargetLoweringObjectFileCOFF();
-  }
+  }
+  llvm_unreachable("unknown subtarget type");
 }
 
 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
@@ -105,7 +92,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   // X86 is weird, it always uses i8 for shift amounts and setcc results.
   setShiftAmountType(MVT::i8);
   setBooleanContents(ZeroOrOneBooleanContent);
-  setSchedulingPreference(SchedulingForRegPressure);
+  setSchedulingPreference(Sched::RegPressure);
   setStackPointerRegisterToSaveRestore(X86StackPtr);
 
   if (Subtarget->isTargetDarwin()) {
@@ -123,8 +110,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // Set up the register classes.
   addRegisterClass(MVT::i8, X86::GR8RegisterClass);
-  if (!Disable16Bit)
-    addRegisterClass(MVT::i16, X86::GR16RegisterClass);
+  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
   addRegisterClass(MVT::i32, X86::GR32RegisterClass);
   if (Subtarget->is64Bit())
     addRegisterClass(MVT::i64, X86::GR64RegisterClass);
@@ -133,11 +119,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // We don't accept any truncstore of integer registers.
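 // Note: marking a truncating store Expand makes the legalizer rewrite it as
 // an explicit TRUNCATE feeding an ordinary store. A minimal sketch of the
 // replacement it builds (names here are illustrative, not from this patch):
 //   SDValue Small = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, WideVal);
 //   SDValue St    = DAG.getStore(Chain, dl, Small, Ptr, SV, SVOffset,
 //                                false, false, Align);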
setTruncStoreAction(MVT::i64, MVT::i32, Expand); - if (!Disable16Bit) - setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); - if (!Disable16Bit) - setTruncStoreAction(MVT::i32, MVT::i16, Expand); + setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); @@ -159,13 +143,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); } else if (!UseSoftFloat) { - if (X86ScalarSSEf64) { - // We have an impenetrably clever algorithm for ui64->double only. - setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); - } + // We have an algorithm for SSE2->double, and we turn this into a + // 64-bit FILD followed by conditional FADD for other targets. + setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD for other targets. - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have @@ -229,9 +212,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // TODO: when we have SSE, these could be more efficient, by using movd/movq. - if (!X86ScalarSSEf64) { + if (!X86ScalarSSEf64) { setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::BIT_CONVERT , MVT::f64 , Expand); + // Without SSE, i64->f64 goes through memory; i64->MMX is Legal. + if (Subtarget->hasMMX() && !DisableMMX) + setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Custom); + else + setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Expand); + } } // Scalar integer divide and remainder are lowered to use operations that @@ -288,13 +279,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::CTTZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); - if (Disable16Bit) { - setOperationAction(ISD::CTTZ , MVT::i16 , Expand); - setOperationAction(ISD::CTLZ , MVT::i16 , Expand); - } else { - setOperationAction(ISD::CTTZ , MVT::i16 , Custom); - setOperationAction(ISD::CTLZ , MVT::i16 , Custom); - } + setOperationAction(ISD::CTTZ , MVT::i16 , Custom); + setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); @@ -311,19 +297,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. 
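 // Note: "Custom" routes these SELECTs to LowerSELECT, which materializes the
 // condition in EFLAGS and emits an X86ISD::CMOV node, roughly:
 //   SDValue Ops[] = { Op2, Op1, CC, Cond };
 //   return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));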
setOperationAction(ISD::SELECT , MVT::i8 , Custom); - if (Disable16Bit) - setOperationAction(ISD::SELECT , MVT::i16 , Expand); - else - setOperationAction(ISD::SELECT , MVT::i16 , Custom); + setOperationAction(ISD::SELECT , MVT::i16 , Custom); setOperationAction(ISD::SELECT , MVT::i32 , Custom); setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); - if (Disable16Bit) - setOperationAction(ISD::SETCC , MVT::i16 , Expand); - else - setOperationAction(ISD::SETCC , MVT::i16 , Custom); + setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); @@ -365,6 +345,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (!Subtarget->hasSSE2()) setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand); + // On X86 and X86-64, atomic operations are lowered to locked instructions. + // Locked instructions, in turn, have implicit fence semantics (all memory + // operations are flushed before issuing the locked instruction, and they + // are not buffered), so we can fold away the common pattern of + // fence-atomic-fence. + setShouldFoldAtomicFences(true); // Expand certain atomics setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); @@ -626,11 +612,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { - addRegisterClass(MVT::v8i8, X86::VR64RegisterClass); - addRegisterClass(MVT::v4i16, X86::VR64RegisterClass); - addRegisterClass(MVT::v2i32, X86::VR64RegisterClass); - addRegisterClass(MVT::v2f32, X86::VR64RegisterClass); - addRegisterClass(MVT::v1i64, X86::VR64RegisterClass); + addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false); + addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false); + addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false); + + addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false); setOperationAction(ISD::ADD, MVT::v8i8, Legal); setOperationAction(ISD::ADD, MVT::v4i16, Legal); @@ -675,14 +661,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64); setOperationAction(ISD::LOAD, MVT::v2i32, Promote); AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64); - setOperationAction(ISD::LOAD, MVT::v2f32, Promote); - AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64); setOperationAction(ISD::LOAD, MVT::v1i64, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); @@ -690,7 +673,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, 
MVT::v1i64, Custom); @@ -704,6 +686,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSETCC, MVT::v8i8, Custom); setOperationAction(ISD::VSETCC, MVT::v4i16, Custom); setOperationAction(ISD::VSETCC, MVT::v2i32, Custom); + + if (!X86ScalarSSEf64 && Subtarget->is64Bit()) { + setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom); + setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom); + setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom); + setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom); + } } if (!UseSoftFloat && Subtarget->hasSSE1()) { @@ -802,9 +791,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) EVT VT = SVT; // Do not attempt to promote non-128-bit vectors - if (!VT.is128BitVector()) { + if (!VT.is128BitVector()) continue; - } setOperationAction(ISD::AND, SVT, Promote); AddPromotedToType (ISD::AND, SVT, MVT::v2i64); @@ -835,6 +823,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } if (Subtarget->hasSSE41()) { + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); + // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -975,15 +974,24 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Add/Sub/Mul with overflow operations are custom lowered. setOperationAction(ISD::SADDO, MVT::i32, Custom); - setOperationAction(ISD::SADDO, MVT::i64, Custom); setOperationAction(ISD::UADDO, MVT::i32, Custom); - setOperationAction(ISD::UADDO, MVT::i64, Custom); setOperationAction(ISD::SSUBO, MVT::i32, Custom); - setOperationAction(ISD::SSUBO, MVT::i64, Custom); setOperationAction(ISD::USUBO, MVT::i32, Custom); - setOperationAction(ISD::USUBO, MVT::i64, Custom); setOperationAction(ISD::SMULO, MVT::i32, Custom); - setOperationAction(ISD::SMULO, MVT::i64, Custom); + + // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't + // handle type legalization for these operations here. + // + // FIXME: We really should do custom legalization for addition and + // subtraction on x86-32 once PR3203 is fixed. We really can't do much better + // than generic legalization for 64-bit multiplication-with-overflow, though. + if (Subtarget->is64Bit()) { + setOperationAction(ISD::SADDO, MVT::i64, Custom); + setOperationAction(ISD::UADDO, MVT::i64, Custom); + setOperationAction(ISD::SSUBO, MVT::i64, Custom); + setOperationAction(ISD::USUBO, MVT::i64, Custom); + setOperationAction(ISD::SMULO, MVT::i64, Custom); + } if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. @@ -1002,7 +1010,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::MEMBARRIER); setTargetDAGCombine(ISD::ZERO_EXTEND); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); @@ -1182,6 +1189,27 @@ unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { return F->hasFnAttr(Attribute::OptimizeForSize) ? 
0 : 4; } +bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, + unsigned &Offset) const { + if (!Subtarget->isTargetLinux()) + return false; + + if (Subtarget->is64Bit()) { + // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: + Offset = 0x28; + if (getTargetMachine().getCodeModel() == CodeModel::Kernel) + AddressSpace = 256; + else + AddressSpace = 257; + } else { + // %gs:0x14 on i386 + Offset = 0x14; + AddressSpace = 256; + } + return true; +} + + //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -1192,10 +1220,10 @@ bool X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &OutTys, const SmallVectorImpl &ArgsFlags, - SelectionDAG &DAG) { + LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + RVLocs, Context); return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86); } @@ -1203,7 +1231,7 @@ SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, - DebugLoc dl, SelectionDAG &DAG) { + DebugLoc dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); @@ -1269,10 +1297,8 @@ X86TargetLowering::LowerReturn(SDValue Chain, MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); unsigned Reg = FuncInfo->getSRetReturnReg(); - if (!Reg) { - Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64)); - FuncInfo->setSRetReturnReg(Reg); - } + assert(Reg && + "SRetReturnReg should have been set in LowerFormalArguments()."); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); @@ -1300,7 +1326,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) { + SmallVectorImpl &InVals) const { // Assign locations to each value returned by this call. SmallVector RVLocs; @@ -1395,26 +1421,6 @@ ArgsAreStructReturn(const SmallVectorImpl &Ins) { return Ins[0].Flags.isSRet(); } -/// IsCalleePop - Determines whether the callee is required to pop its -/// own arguments. Callee pop is necessary to support tail calls. -bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){ - if (IsVarArg) - return false; - - switch (CallingConv) { - default: - return false; - case CallingConv::X86_StdCall: - return !Subtarget->is64Bit(); - case CallingConv::X86_FastCall: - return !Subtarget->is64Bit(); - case CallingConv::Fast: - return GuaranteedTailCallOpt; - case CallingConv::GHC: - return GuaranteedTailCallOpt; - } -} - /// CCAssignFnForNode - Selects the correct CCAssignFn for a the /// given CallingConvention value. 
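/// Note: this patch teaches CCAssignFnForNode about CallingConv::X86_ThisCall
/// (the MSVC convention for C++ instance methods: 'this' in ECX, callee-pop),
/// mapping it to its own table, CC_X86_32_ThisCall.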
CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { @@ -1429,6 +1435,8 @@ CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { if (CC == CallingConv::X86_FastCall) return CC_X86_32_FastCall; + else if (CC == CallingConv::X86_ThisCall) + return CC_X86_32_ThisCall; else if (CC == CallingConv::Fast) return CC_X86_32_FastCC; else if (CC == CallingConv::GHC) @@ -1470,7 +1478,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, DebugLoc dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo *MFI, - unsigned i) { + unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); @@ -1490,11 +1498,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { int FI = MFI->CreateFixedObject(Flags.getByValSize(), - VA.getLocMemOffset(), isImmutable, false); + VA.getLocMemOffset(), isImmutable); return DAG.getFrameIndex(FI, getPointerTy()); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, - VA.getLocMemOffset(), isImmutable, false); + VA.getLocMemOffset(), isImmutable); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); return DAG.getLoad(ValVT, dl, Chain, FIN, PseudoSourceValue::getFixedStack(FI), 0, @@ -1509,7 +1517,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) { + SmallVectorImpl &InVals) + const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); @@ -1619,9 +1628,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { - if (Is64Bit || CallConv != CallingConv::X86_FastCall) { - FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, - true, false)); + if (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall)) { + FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); } if (Is64Bit) { unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; @@ -1727,7 +1736,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } // Some CCs need callee pop. - if (IsCalleePop(isVarArg, CallConv)) { + if (Subtarget->IsCalleePop(isVarArg, CallConv)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. @@ -1739,7 +1748,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); - if (CallConv == CallingConv::X86_FastCall) + if (CallConv == CallingConv::X86_FastCall || + CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } @@ -1752,7 +1762,7 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, DebugLoc dl, SelectionDAG &DAG, const CCValAssign &VA, - ISD::ArgFlagsTy Flags) { + ISD::ArgFlagsTy Flags) const { const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 
32 : 0); unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); @@ -1771,7 +1781,7 @@ SDValue X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, - int FPDiff, DebugLoc dl) { + int FPDiff, DebugLoc dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(); OutRetAddr = getReturnAddressFrameIndex(DAG); @@ -1792,7 +1802,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, // Calculate the new stack slot for the return address. int SlotSize = Is64Bit ? 8 : 4; int NewReturnAddrFI = - MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false); + MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); EVT VT = Is64Bit ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, @@ -1808,7 +1818,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const SmallVectorImpl &Outs, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) { + SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsStructRet = CallIsStructReturn(Outs); @@ -2022,7 +2032,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; - FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false); + FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy()); if (Flags.isByVal()) { @@ -2063,7 +2073,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, FPDiff, dl); } - bool WasGlobalOrExternal = false; if (getTargetMachine().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls @@ -2071,7 +2080,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // pc-relative offset may not be large enough to hold the whole // address. } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - WasGlobalOrExternal = true; // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. @@ -2099,11 +2107,10 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, OpFlags = X86II::MO_DARWIN_STUB; } - Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(), + Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), G->getOffset(), OpFlags); } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { - WasGlobalOrExternal = true; unsigned char OpFlags = 0; // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external @@ -2157,17 +2164,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Ops.push_back(InFlag); if (isTailCall) { - // If this is the first return lowered for this function, add the regs - // to the liveout set for the function. 
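 // Note: the block being removed here computed liveouts at tail-call sites.
 // A hypothetical C example of why that is wrong:
 //   int callee(void);
 //   void caller(void) { callee(); }   // tail position, result dead
 // Registering EAX as a liveout of caller() would claim a return value that
 // no return in caller() actually produces.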
- if (MF.getRegInfo().liveout_empty()) { - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, - *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC_X86); - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc()) - MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } + // We used to do: + //// If this is the first return lowered for this function, add the regs + //// to the liveout set for the function. + // This isn't right, although it's probably harmless on x86; liveouts + // should be computed from returns not tail calls. Consider a void + // function making a tail call to a function returning int. return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); } @@ -2177,7 +2179,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPush; - if (IsCalleePop(isVarArg, CallConv)) + if (Subtarget->IsCalleePop(isVarArg, CallConv)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) // If this is a call to a struct-return function, the callee @@ -2237,8 +2239,9 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned /// for a 16 byte align requirement. -unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, - SelectionDAG& DAG) { +unsigned +X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, + SelectionDAG& DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); const TargetFrameInfo &TFI = *TM.getFrameInfo(); @@ -2326,15 +2329,17 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // If -tailcallopt is specified, make fastcc functions tail-callable. const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = DAG.getMachineFunction().getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + if (GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && - CallerF->getCallingConv() == CalleeCC) + if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; } - // Look for obvious safe cases to perform tail call optimization that does not - // requite ABI changes. This is what gcc calls sibcall. + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. @@ -2366,13 +2371,43 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CCState CCInfo(CalleeCC, false, getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); - for (unsigned i = 0; i != RVLocs.size(); ++i) { + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) return false; } } + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. 
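 // Note: the check below analyzes the call result under both conventions and
 // rejects the sibcall if any CCValAssign differs in kind (register vs.
 // memory), location info, register, or stack offset. Conservative, but cheap
 // and safe.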
+ if (!CCMatch) { + SmallVector RVLocs1; + CCState CCInfo1(CalleeCC, false, getTargetMachine(), + RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); + + SmallVector RVLocs2; + CCState CCInfo2(CallerCC, false, getTargetMachine(), + RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { @@ -2398,7 +2433,6 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - EVT RegVT = VA.getLocVT(); SDValue Arg = Outs[i].Val; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) @@ -2410,6 +2444,24 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } } + + // If the tailcall address may be in a register, then make sure it's + // possible to register allocate for it. In 32-bit, the call address can + // only target EAX, EDX, or ECX since the tail call must be scheduled after + // callee-saved registers are restored. In 64-bit, it's RAX, RCX, RDX, RSI, + // RDI, R8, R9, R11. + if (!isa(Callee) && + !isa(Callee)) { + unsigned Limit = Subtarget->is64Bit() ? 8 : 3; + unsigned NumInRegs = 0; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (VA.isRegLoc()) { + if (++NumInRegs == Limit) + return false; + } + } + } } return true; @@ -2419,12 +2471,13 @@ FastISel * X86TargetLowering::createFastISel(MachineFunction &mf, DenseMap &vm, DenseMap &bm, - DenseMap &am + DenseMap &am, + std::vector > &pn #ifndef NDEBUG , SmallSet &cil #endif - ) { - return X86::createFastISel(mf, vm, bm, am + ) const { + return X86::createFastISel(mf, vm, bm, am, pn #ifndef NDEBUG , cil #endif @@ -2437,7 +2490,7 @@ X86TargetLowering::createFastISel(MachineFunction &mf, //===----------------------------------------------------------------------===// -SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { +SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); @@ -2446,7 +2499,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { // Set up a frame object for the return address. uint64_t SlotSize = TD->getPointerSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, - false, false); + false); FuncInfo->setRAIndex(ReturnAddrIndex); } @@ -3145,7 +3198,7 @@ unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { /// constant +0.0. 
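/// Note: the switch below from getZExtValue() == 0 to isNullValue() is the
/// safer spelling: isNullValue() works for a constant of any bit width, while
/// getZExtValue() asserts on values wider than 64 bits.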
bool X86::isZeroNode(SDValue Elt) { return ((isa(Elt) && - cast(Elt)->getZExtValue() == 0) || + cast(Elt)->isNullValue()) || (isa(Elt) && cast(Elt)->getValueAPF().isPosZero())); } @@ -3498,7 +3551,8 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, /// static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, - SelectionDAG &DAG, TargetLowering &TLI) { + SelectionDAG &DAG, + const TargetLowering &TLI) { if (NumNonZero > 8) return SDValue(); @@ -3543,8 +3597,9 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. /// static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, - unsigned NumNonZero, unsigned NumZero, - SelectionDAG &DAG, TargetLowering &TLI) { + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const TargetLowering &TLI) { if (NumNonZero > 4) return SDValue(); @@ -3586,7 +3641,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, SDValue X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into @@ -3718,7 +3773,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, } SDValue -X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); // All zero's are handled with pxor, all one's are handled with pcmpeqd. if (ISD::isBuildVectorAllZeros(Op.getNode()) @@ -3984,7 +4039,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { } SDValue -X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { // We support concatenate two MMX registers and place them in a MMX // register. This is better than doing a stack convert. DebugLoc dl = Op.getDebugLoc(); @@ -4017,7 +4072,8 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) static SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG, X86TargetLowering &TLI) { + SelectionDAG &DAG, + const X86TargetLowering &TLI) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); @@ -4260,7 +4316,8 @@ SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, // 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG, X86TargetLowering &TLI) { + SelectionDAG &DAG, + const X86TargetLowering &TLI) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); @@ -4399,21 +4456,20 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide -/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be +/// ones, or rewriting v4i32 / v2i32 as 2 wide ones if possible. This can be /// done when every pair / quad of shuffle mask elements point to elements in /// the right sequence. e.g. 
/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, - TargetLowering &TLI, DebugLoc dl) { + const TargetLowering &TLI, DebugLoc dl) { EVT VT = SVOp->getValueType(0); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); unsigned NumElems = VT.getVectorNumElements(); unsigned NewWidth = (NumElems == 4) ? 2 : 4; EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); - EVT MaskEltVT = MaskVT.getVectorElementType(); EVT NewVT = MaskVT; switch (VT.getSimpleVT().SimpleTy) { default: assert(false && "Unexpected!"); @@ -4638,7 +4694,7 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } SDValue -X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -4825,7 +4881,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); if (VT.getSizeInBits() == 8) { @@ -4879,7 +4935,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SDValue -X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { if (!isa(Op.getOperand(1))) return SDValue(); @@ -4943,7 +5000,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { } SDValue -X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ +X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, + SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT EltVT = VT.getVectorElementType(); DebugLoc dl = Op.getDebugLoc(); @@ -4992,7 +5050,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ } SDValue -X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT EltVT = VT.getVectorElementType(); @@ -5021,15 +5079,11 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { } SDValue -X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - if (Op.getValueType() == MVT::v2f32) - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, - Op.getOperand(0)))); - - if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) + + if (Op.getValueType() == MVT::v1i64 && + Op.getOperand(0).getValueType() == MVT::i64) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); @@ -5052,7 +5106,7 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // be used to form addressing mode. These wrapped nodes will be selected // into MOV32ri. 
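 // Note: the lowerings below all share one pattern: build a Target* node with
 // the right X86II::MO_* flag, then wrap it so isel can fold it into an
 // addressing mode. A sketch, assuming a constant-pool entry:
 //   SDValue CPIdx = DAG.getTargetConstantPool(C, getPointerTy(),
 //                                             Alignment, Offset, OpFlag);
 //   SDValue Addr  = DAG.getNode(WrapperKind, dl, getPointerTy(), CPIdx);
 // where WrapperKind is X86ISD::WrapperRIP under RIP-relative PIC and
 // X86ISD::Wrapper otherwise.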
SDValue -X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the @@ -5085,7 +5139,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { return Result; } -SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the @@ -5119,7 +5173,7 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { } SDValue -X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { const char *Sym = cast(Op)->getSymbol(); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the @@ -5155,7 +5209,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { } SDValue -X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget->ClassifyBlockAddressReference(); @@ -5194,10 +5248,10 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { // A direct static reference to a global. - Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); + Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); Offset = 0; } else { - Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); + Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); } if (Subtarget->isPICStyleRIPRel() && @@ -5229,7 +5283,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, } SDValue -X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast(Op)->getGlobal(); int64_t Offset = cast(Op)->getOffset(); return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); @@ -5242,7 +5296,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); DebugLoc dl = GA->getDebugLoc(); - SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); @@ -5255,7 +5309,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 
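 // Note: setHasCalls(true) becomes setAdjustsStack(true) below; the intent of
 // the flag is that frame lowering must treat this function as one that makes
 // a call (reserving call-frame space, keeping the stack aligned) even though
 // no ordinary call node is visible at this point.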
- MFI->setHasCalls(true); + MFI->setAdjustsStack(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); @@ -5315,7 +5369,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial // exec) - SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); @@ -5329,34 +5384,79 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, } SDValue -X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { - // TODO: implement the "local dynamic" model - // TODO: implement the "initial exec"model for pic executables - assert(Subtarget->isTargetELF() && - "TLS not implemented for non-ELF targets"); +X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + GlobalAddressSDNode *GA = cast(Op); const GlobalValue *GV = GA->getGlobal(); - // If GV is an alias then use the aliasee for determining - // thread-localness. - if (const GlobalAlias *GA = dyn_cast(GV)) - GV = GA->resolveAliasedGlobal(false); - - TLSModel::Model model = getTLSModel(GV, - getTargetMachine().getRelocationModel()); - - switch (model) { - case TLSModel::GeneralDynamic: - case TLSModel::LocalDynamic: // not implemented - if (Subtarget->is64Bit()) - return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); - return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); + if (Subtarget->isTargetELF()) { + // TODO: implement the "local dynamic" model + // TODO: implement the "initial exec"model for pic executables + + // If GV is an alias then use the aliasee for determining + // thread-localness. + if (const GlobalAlias *GA = dyn_cast(GV)) + GV = GA->resolveAliasedGlobal(false); + + TLSModel::Model model + = getTLSModel(GV, getTargetMachine().getRelocationModel()); + + switch (model) { + case TLSModel::GeneralDynamic: + case TLSModel::LocalDynamic: // not implemented + if (Subtarget->is64Bit()) + return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); + return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); + + case TLSModel::InitialExec: + case TLSModel::LocalExec: + return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, + Subtarget->is64Bit()); + } + } else if (Subtarget->isTargetDarwin()) { + // Darwin only has one model of TLS. Lower to that. + unsigned char OpFlag = 0; + unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? + X86ISD::WrapperRIP : X86ISD::Wrapper; + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && + !Subtarget->is64Bit(); + if (PIC32) + OpFlag = X86II::MO_TLVP_PIC_BASE; + else + OpFlag = X86II::MO_TLVP; + DebugLoc DL = Op.getDebugLoc(); + SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, + getPointerTy(), + GA->getOffset(), OpFlag); + SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); + + // With PIC32, the address is actually $g + Offset. + if (PIC32) + Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), getPointerTy()), + Offset); + + // Lowering the machine isd will make sure everything is in the right + // location. 
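 // Note: on Darwin, a thread-local symbol resolves to a TLV descriptor whose
 // first word is a getter function; X86ISD::TLSCALL is selected to an indirect
 // call through that slot, conceptually (x86-64):
 //   movq  _var@TLVP(%rip), %rdi
 //   callq *(%rdi)            ## leaves the address of var in %rax
 // which is why the code below simply copies the result out of RAX/EAX.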
+ SDValue Args[] = { Offset }; + SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); + + // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setAdjustsStack(true); - case TLSModel::InitialExec: - case TLSModel::LocalExec: - return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, - Subtarget->is64Bit()); + // And our return value (tls address) is in the standard call return value + // location. + unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); } + + assert(false && + "TLS not implemented for this target."); llvm_unreachable("Unreachable"); return SDValue(); @@ -5365,7 +5465,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { /// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and /// take a 2 x i32 value to shift plus a shift amount. -SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -5409,7 +5509,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { return DAG.getMergeValues(Ops, 2, dl); } -SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); if (SrcVT.isVector()) { @@ -5444,8 +5545,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { } SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, - SDValue StackSlot, - SelectionDAG &DAG) { + SDValue StackSlot, + SelectionDAG &DAG) const { // Build the FILD DebugLoc dl = Op.getDebugLoc(); SDVTList Tys; @@ -5482,7 +5583,8 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, } // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. -SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, + SelectionDAG &DAG) const { // This algorithm is not obvious. Here it is in C code, more or less: /* double uint64_to_double( uint32_t hi, uint32_t lo ) { @@ -5566,7 +5668,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { } // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. -SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, + SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), @@ -5611,43 +5714,81 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { return Sub; } -SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); - // Now not UINT_TO_FP is legal (it's marked custom), dag combiner won't + // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. 
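 // Example: (uint_to_fp (zext i16 %x to i32)) has a known-zero sign bit, so
 // it can be emitted as the cheaper sint_to_fp.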
if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); EVT SrcVT = N0.getValueType(); - if (SrcVT == MVT::i64) { - // We only handle SSE2 f64 target here; caller can expand the rest. - if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) - return SDValue(); - + EVT DstVT = Op.getValueType(); + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); - } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) { + else if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); - } - - assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!"); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); - SDValue WordOff = DAG.getConstant(4, getPointerTy()); - SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, - getPointerTy(), StackSlot, WordOff); - SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + if (SrcVT == MVT::i32) { + SDValue WordOff = DAG.getConstant(4, getPointerTy()); + SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, + getPointerTy(), StackSlot, WordOff); + SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + StackSlot, NULL, 0, false, false, 0); + SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), + OffsetSlot, NULL, 0, false, false, 0); + SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); + return Fild; + } + + assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, NULL, 0, false, false, 0); - SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), - OffsetSlot, NULL, 0, false, false, 0); - return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); + // For i64 source, we need to add the appropriate power of 2 if the input + // was negative. This is the same as the optimization in + // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, + // we must be careful to do the computation in x87 extended precision, not + // in SSE. (The generic code can't know it's OK to do this, or how to.) + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); + SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; + SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3); + + APInt FF(32, 0x5F800000ULL); + + // Check whether the sign bit is set. + SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), + Op.getOperand(0), DAG.getConstant(0, MVT::i64), + ISD::SETLT); + + // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. + SDValue FudgePtr = DAG.getConstantPool( + ConstantInt::get(*DAG.getContext(), FF.zext(64)), + getPointerTy()); + + // Get a pointer to FF if the sign bit was set, or to 0 otherwise. + SDValue Zero = DAG.getIntPtrConstant(0); + SDValue Four = DAG.getIntPtrConstant(4); + SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, + Zero, Four); + FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); + + // Load the value out, extending it from f32 to f80. + // FIXME: Avoid the extend by constructing the right constant pool? + SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), + FudgePtr, PseudoSourceValue::getConstantPool(), + 0, MVT::f32, false, false, 4); + // Extend everything to 80 bits to force it to be done on x87. 
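 // Note on the fudge constant: 0x5F800000 is the IEEE-754 single-precision
 // encoding of 2^64 (sign 0, biased exponent 0xBF = 191, mantissa 0, so
 // 2^(191-127)). FILD read the i64 as signed, so a negative input arrived as
 // (value - 2^64); the select above picks the 2^64 fudge word exactly in that
 // case, and this FADD adds it back in extended precision before rounding.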
+ SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); + return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); } std::pair X86TargetLowering:: -FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { +FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { DebugLoc dl = Op.getDebugLoc(); EVT DstTy = Op.getValueType(); @@ -5709,7 +5850,8 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { return std::make_pair(FIST, StackSlot); } -SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { if (Op.getValueType().isVector()) { if (Op.getValueType() == MVT::v2i32 && Op.getOperand(0).getValueType() == MVT::v2f64) { @@ -5728,7 +5870,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { FIST, StackSlot, NULL, 0, false, false, 0); } -SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, + SelectionDAG &DAG) const { std::pair Vals = FP_TO_INTHelper(Op, DAG, false); SDValue FIST = Vals.first, StackSlot = Vals.second; assert(FIST.getNode() && "Unexpected failure"); @@ -5738,7 +5881,8 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { FIST, StackSlot, NULL, 0, false, false, 0); } -SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerFABS(SDValue Op, + SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -5765,7 +5909,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } -SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -5800,7 +5944,7 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { } } -SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -5876,7 +6020,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); // CF and OF aren't always set the way we want. Determine which @@ -5884,6 +6028,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, bool NeedCF = false; bool NeedOF = false; switch (X86CC) { + default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; @@ -5893,139 +6038,147 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, case X86::COND_O: case X86::COND_NO: NeedOF = true; break; - default: break; } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. 
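 // Note: reusing EFLAGS saves an instruction. "addl %esi, %edi" already sets
 // ZF/SF, so a following "testl %edi, %edi" is redundant when only those
 // flags are consumed; the rewrite below returns the flag result (value #1)
 // of an X86ISD arithmetic node instead of emitting a separate CMP/TEST.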
- if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { - unsigned Opcode = 0; - unsigned NumOperands = 0; - switch (Op.getNode()->getOpcode()) { - case ISD::ADD: - // Due to an isel shortcoming, be conservative if this add is likely to - // be selected as part of a load-modify-store instruction. When the root - // node in a match is a store, isel doesn't know how to remap non-chain - // non-flag uses of other nodes in the match, such as the ADD in this - // case. This leads to the ADD being left around and reselected, with - // the result being two adds in the output. - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + if (Op.getResNo() != 0 || NeedOF || NeedCF) + // Emit a CMP with 0, which is the TEST pattern. + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, Op.getValueType())); + + unsigned Opcode = 0; + unsigned NumOperands = 0; + switch (Op.getNode()->getOpcode()) { + case ISD::ADD: + // Due to an isel shortcoming, be conservative if this add is likely to be + // selected as part of a load-modify-store instruction. When the root node + // in a match is a store, isel doesn't know how to remap non-chain non-flag + // uses of other nodes in the match, such as the ADD in this case. This + // leads to the ADD being left around and reselected, with the result being + // two adds in the output. Alas, even if none our users are stores, that + // doesn't prove we're O.K. Ergo, if we have any parents that aren't + // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require + // climbing the DAG back to the root, and it doesn't seem to be worth the + // effort. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() == ISD::STORE) - goto default_case; - if (ConstantSDNode *C = - dyn_cast(Op.getNode()->getOperand(1))) { - // An add of one will be selected as an INC. - if (C->getAPIntValue() == 1) { - Opcode = X86ISD::INC; - NumOperands = 1; - break; - } - // An add of negative one (subtract of one) will be selected as a DEC. - if (C->getAPIntValue().isAllOnesValue()) { - Opcode = X86ISD::DEC; - NumOperands = 1; - break; - } + if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) + goto default_case; + + if (ConstantSDNode *C = + dyn_cast(Op.getNode()->getOperand(1))) { + // An add of one will be selected as an INC. + if (C->getAPIntValue() == 1) { + Opcode = X86ISD::INC; + NumOperands = 1; + break; } - // Otherwise use a regular EFLAGS-setting add. - Opcode = X86ISD::ADD; - NumOperands = 2; - break; - case ISD::AND: { - // If the primary and result isn't used, don't bother using X86ISD::AND, - // because a TEST instruction will be better. - bool NonFlagUse = false; - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - unsigned UOpNo = UI.getOperandNo(); - if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { - // Look pass truncate. - UOpNo = User->use_begin().getOperandNo(); - User = *User->use_begin(); - } - if (User->getOpcode() != ISD::BRCOND && - User->getOpcode() != ISD::SETCC && - (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { - NonFlagUse = true; - break; - } + + // An add of negative one (subtract of one) will be selected as a DEC. + if (C->getAPIntValue().isAllOnesValue()) { + Opcode = X86ISD::DEC; + NumOperands = 1; + break; } - if (!NonFlagUse) + } + + // Otherwise use a regular EFLAGS-setting add. 
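 // (INC/DEC above are preferred for +/-1 but do not update CF; that is safe
 // here because the NeedCF case already bailed out to a plain CMP earlier in
 // this function.)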
+ Opcode = X86ISD::ADD; + NumOperands = 2; + break; + case ISD::AND: { + // If the primary and result isn't used, don't bother using X86ISD::AND, + // because a TEST instruction will be better. + bool NonFlagUse = false; + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + unsigned UOpNo = UI.getOperandNo(); + if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { + // Look pass truncate. + UOpNo = User->use_begin().getOperandNo(); + User = *User->use_begin(); + } + + if (User->getOpcode() != ISD::BRCOND && + User->getOpcode() != ISD::SETCC && + (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { + NonFlagUse = true; break; + } } + + if (!NonFlagUse) + break; + } // FALL THROUGH - case ISD::SUB: - case ISD::OR: - case ISD::XOR: - // Due to the ISEL shortcoming noted above, be conservative if this op is - // likely to be selected as part of a load-modify-store instruction. - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + case ISD::SUB: + case ISD::OR: + case ISD::XOR: + // Due to the ISEL shortcoming noted above, be conservative if this op is + // likely to be selected as part of a load-modify-store instruction. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() == ISD::STORE) - goto default_case; - // Otherwise use a regular EFLAGS-setting instruction. - switch (Op.getNode()->getOpcode()) { - case ISD::SUB: Opcode = X86ISD::SUB; break; - case ISD::OR: Opcode = X86ISD::OR; break; - case ISD::XOR: Opcode = X86ISD::XOR; break; - case ISD::AND: Opcode = X86ISD::AND; break; - default: llvm_unreachable("unexpected operator!"); - } - NumOperands = 2; - break; - case X86ISD::ADD: - case X86ISD::SUB: - case X86ISD::INC: - case X86ISD::DEC: - case X86ISD::OR: - case X86ISD::XOR: - case X86ISD::AND: - return SDValue(Op.getNode(), 1); - default: - default_case: - break; - } - if (Opcode != 0) { - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SmallVector Ops; - for (unsigned i = 0; i != NumOperands; ++i) - Ops.push_back(Op.getOperand(i)); - SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); - DAG.ReplaceAllUsesWith(Op, New); - return SDValue(New.getNode(), 1); + if (UI->getOpcode() == ISD::STORE) + goto default_case; + + // Otherwise use a regular EFLAGS-setting instruction. + switch (Op.getNode()->getOpcode()) { + default: llvm_unreachable("unexpected operator!"); + case ISD::SUB: Opcode = X86ISD::SUB; break; + case ISD::OR: Opcode = X86ISD::OR; break; + case ISD::XOR: Opcode = X86ISD::XOR; break; + case ISD::AND: Opcode = X86ISD::AND; break; } + + NumOperands = 2; + break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + return SDValue(Op.getNode(), 1); + default: + default_case: + break; } - // Otherwise just emit a CMP with 0, which is the TEST pattern. - if (Promote16Bit && Op.getValueType() == MVT::i16) - Op = DAG.getNode(ISD::ANY_EXTEND, Op.getDebugLoc(), MVT::i32, Op); - return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, - DAG.getConstant(0, Op.getValueType())); + if (Opcode == 0) + // Emit a CMP with 0, which is the TEST pattern. 
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, Op.getValueType()));
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0; i != NumOperands; ++i)
+    Ops.push_back(Op.getOperand(i));
+
+  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+  DAG.ReplaceAllUsesWith(Op, New);
+  return SDValue(New.getNode(), 1);
 }
 
 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
 /// equivalent.
 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
-                                   SelectionDAG &DAG) {
+                                   SelectionDAG &DAG) const {
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
     if (C->getAPIntValue() == 0)
       return EmitTest(Op0, X86CC, DAG);
 
   DebugLoc dl = Op0.getDebugLoc();
-  if (Promote16Bit && Op0.getValueType() == MVT::i16) {
-    Op0 = DAG.getNode(ISD::ANY_EXTEND, Op0.getDebugLoc(), MVT::i32, Op0);
-    Op1 = DAG.getNode(ISD::ANY_EXTEND, Op1.getDebugLoc(), MVT::i32, Op1);
-  }
   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
 }
 
 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
 /// if it's possible.
-static SDValue LowerToBT(SDValue And, ISD::CondCode CC,
-                         DebugLoc dl, SelectionDAG &DAG) {
+SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
+                                     DebugLoc dl, SelectionDAG &DAG) const {
   SDValue Op0 = And.getOperand(0);
   SDValue Op1 = And.getOperand(1);
   if (Op0.getOpcode() == ISD::TRUNCATE)
@@ -6034,15 +6187,21 @@ static SDValue LowerToBT(SDValue And, ISD::CondCode CC,
     Op1 = Op1.getOperand(0);
 
   SDValue LHS, RHS;
-  if (Op1.getOpcode() == ISD::SHL) {
-    if (ConstantSDNode *And10C = dyn_cast<ConstantSDNode>(Op1.getOperand(0)))
-      if (And10C->getZExtValue() == 1) {
-        LHS = Op0;
-        RHS = Op1.getOperand(1);
-      }
-  } else if (Op0.getOpcode() == ISD::SHL) {
+  if (Op1.getOpcode() == ISD::SHL)
+    std::swap(Op0, Op1);
+  if (Op0.getOpcode() == ISD::SHL) {
     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
       if (And00C->getZExtValue() == 1) {
+        // If we looked past a truncate, check that it's only truncating away
+        // known zeros.
+        unsigned BitWidth = Op0.getValueSizeInBits();
+        unsigned AndBitWidth = And.getValueSizeInBits();
+        if (BitWidth > AndBitWidth) {
+          APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones;
+          DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones);
+          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+            return SDValue();
+        }
         LHS = Op1;
         RHS = Op0.getOperand(1);
       }
@@ -6062,7 +6221,7 @@ static SDValue LowerToBT(SDValue And, ISD::CondCode CC,
   // the encoding for the i16 version is larger than the i32 version.
   // Also promote i16 to i32 for performance / code size reason.
   if (LHS.getValueType() == MVT::i8 ||
-      (Promote16Bit && LHS.getValueType() == MVT::i16))
+      LHS.getValueType() == MVT::i16)
     LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
 
   // If the operand types disagree, extend the shift amount to match.  Since
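// Editorial aside, a minimal standalone sketch (not part of the patch) of the
// identity LowerToBT exploits: comparing "x & (1 << n)" against zero reads
// bit n of x, which is exactly what BT computes into the carry flag.
static inline bool TestBitViaMask(unsigned x, unsigned n) {
  return (x & (1u << n)) != 0;    // the AND-with-shifted-1 form matched above
}
static inline bool TestBitViaBT(unsigned x, unsigned n) {
  return ((x >> n) & 1u) != 0;    // what "bt x, n; setc" effectively computes
}
// For any n below the operand width the two agree, so a SETCC/BRCOND user can
// consume CF directly instead of materializing the shifted mask.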
@@ -6079,7 +6238,7 @@ static SDValue LowerToBT(SDValue And, ISD::CondCode CC,
   return SDValue();
 }
 
-SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
@@ -6093,7 +6252,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
 
   if (Op0.getOpcode() == ISD::AND &&
       Op0.hasOneUse() &&
       Op1.getOpcode() == ISD::Constant &&
-      cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
+      cast<ConstantSDNode>(Op1)->isNullValue() &&
       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
     if (NewSetCC.getNode())
@@ -6133,7 +6292,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
                      DAG.getConstant(X86CC, MVT::i8), Cond);
 }
 
-SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
+SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Cond;
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
@@ -6269,7 +6428,7 @@ static bool isX86LogicalCmp(SDValue Op) {
   return false;
 }
 
-SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   bool addTest = true;
   SDValue Cond  = Op.getOperand(0);
   DebugLoc dl = Op.getDebugLoc();
@@ -6389,7 +6548,7 @@ static bool isXor1OfSetCC(SDValue Op) {
   return false;
 }
 
-SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   bool addTest = true;
   SDValue Chain = Op.getOperand(0);
   SDValue Cond  = Op.getOperand(1);
@@ -6473,15 +6632,16 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
       CCode = X86::GetOppositeBranchCondition(CCode);
       CC = DAG.getConstant(CCode, MVT::i8);
-      SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
+      SDNode *User = *Op.getNode()->use_begin();
       // Look for an unconditional branch following this conditional branch.
       // We need this because we need to reverse the successors in order
       // to implement FCMP_OEQ.
-      if (User.getOpcode() == ISD::BR) {
-        SDValue FalseBB = User.getOperand(1);
-        SDValue NewBR =
-          DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
+      if (User->getOpcode() == ISD::BR) {
+        SDValue FalseBB = User->getOperand(1);
+        SDNode *NewBR =
+          DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
         assert(NewBR == User);
+        (void)NewBR;
         Dest = FalseBB;
 
         Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -6541,7 +6701,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
 // correct sequence.
 SDValue
 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
-                                           SelectionDAG &DAG) {
+                                           SelectionDAG &DAG) const {
   assert(Subtarget->isTargetCygMing() &&
          "This should be used only on Cygwin/Mingw targets");
   DebugLoc dl = Op.getDebugLoc();
@@ -6553,7 +6713,6 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
 
   SDValue Flag;
 
-  EVT IntPtr = getPointerTy();
   EVT SPTy = Subtarget->is64Bit() ?
MVT::i64 : MVT::i32; Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); @@ -6570,220 +6729,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops1, 2, dl); } -SDValue -X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, - const Value *DstSV, - uint64_t DstSVOff) { - ConstantSDNode *ConstantSize = dyn_cast(Size); - - // If not DWORD aligned or size is more than the threshold, call the library. - // The libc version is likely to be faster for these cases. It can use the - // address value and run time information about the CPU. - if ((Align & 3) != 0 || - !ConstantSize || - ConstantSize->getZExtValue() > - getSubtarget()->getMaxInlineSizeThreshold()) { - SDValue InFlag(0, 0); - - // Check to see if there is a specialized entry-point for memory zeroing. - ConstantSDNode *V = dyn_cast(Src); - - if (const char *bzeroEntry = V && - V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { - EVT IntPtr = getPointerTy(); - const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext()); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Dst; - Entry.Ty = IntPtrTy; - Args.push_back(Entry); - Entry.Node = Size; - Args.push_back(Entry); - std::pair CallResult = - LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), - false, false, false, false, - 0, CallingConv::C, false, /*isReturnValueUsed=*/false, - DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); - return CallResult.second; - } - - // Otherwise have the target-independent code call memset. - return SDValue(); - } - - uint64_t SizeVal = ConstantSize->getZExtValue(); - SDValue InFlag(0, 0); - EVT AVT; - SDValue Count; - ConstantSDNode *ValC = dyn_cast(Src); - unsigned BytesLeft = 0; - bool TwoRepStos = false; - if (ValC) { - unsigned ValReg; - uint64_t Val = ValC->getZExtValue() & 255; - - // If the value is a constant, then we can potentially use larger sets. - switch (Align & 3) { - case 2: // WORD aligned - AVT = MVT::i16; - ValReg = X86::AX; - Val = (Val << 8) | Val; - break; - case 0: // DWORD aligned - AVT = MVT::i32; - ValReg = X86::EAX; - Val = (Val << 8) | Val; - Val = (Val << 16) | Val; - if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned - AVT = MVT::i64; - ValReg = X86::RAX; - Val = (Val << 32) | Val; - } - break; - default: // Byte aligned - AVT = MVT::i8; - ValReg = X86::AL; - Count = DAG.getIntPtrConstant(SizeVal); - break; - } - - if (AVT.bitsGT(MVT::i8)) { - unsigned UBytes = AVT.getSizeInBits() / 8; - Count = DAG.getIntPtrConstant(SizeVal / UBytes); - BytesLeft = SizeVal % UBytes; - } - - Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), - InFlag); - InFlag = Chain.getValue(1); - } else { - AVT = MVT::i8; - Count = DAG.getIntPtrConstant(SizeVal); - Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); - InFlag = Chain.getValue(1); - } - - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); - InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? 
X86::RDI : - X86::EDI, - Dst, InFlag); - InFlag = Chain.getValue(1); - - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; - Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); - - if (TwoRepStos) { - InFlag = Chain.getValue(1); - Count = Size; - EVT CVT = Count.getValueType(); - SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, - DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); - Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : - X86::ECX, - Left, InFlag); - InFlag = Chain.getValue(1); - Tys = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; - Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); - } else if (BytesLeft) { - // Handle the last 1 - 7 bytes. - unsigned Offset = SizeVal - BytesLeft; - EVT AddrVT = Dst.getValueType(); - EVT SizeVT = Size.getValueType(); - - Chain = DAG.getMemset(Chain, dl, - DAG.getNode(ISD::ADD, dl, AddrVT, Dst, - DAG.getConstant(Offset, AddrVT)), - Src, - DAG.getConstant(BytesLeft, SizeVT), - Align, isVolatile, DstSV, DstSVOff + Offset); - } - - // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. - return Chain; -} - -SDValue -X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, bool AlwaysInline, - const Value *DstSV, uint64_t DstSVOff, - const Value *SrcSV, uint64_t SrcSVOff) { - // This requires the copy size to be a constant, preferrably - // within a subtarget-specific limit. - ConstantSDNode *ConstantSize = dyn_cast(Size); - if (!ConstantSize) - return SDValue(); - uint64_t SizeVal = ConstantSize->getZExtValue(); - if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) - return SDValue(); - - /// If not DWORD aligned, call the library. - if ((Align & 3) != 0) - return SDValue(); - - // DWORD aligned - EVT AVT = MVT::i32; - if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned - AVT = MVT::i64; - - unsigned UBytes = AVT.getSizeInBits() / 8; - unsigned CountVal = SizeVal / UBytes; - SDValue Count = DAG.getIntPtrConstant(CountVal); - unsigned BytesLeft = SizeVal % UBytes; - - SDValue InFlag(0, 0); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); - InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : - X86::EDI, - Dst, InFlag); - InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : - X86::ESI, - Src, InFlag); - InFlag = Chain.getValue(1); - - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; - SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops, - array_lengthof(Ops)); - - SmallVector Results; - Results.push_back(RepMovs); - if (BytesLeft) { - // Handle the last 1 - 7 bytes. 
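// Editorial aside, a hedged sketch (not part of the patch) of the chunking
// scheme used by the inline-memcpy expansion that this hunk deletes: one
// REP MOVS over word-sized pieces, then a short tail copy of 1-7 bytes,
// mirroring CountVal = SizeVal / UBytes and BytesLeft = SizeVal % UBytes.
#include <cstddef>
struct RepMovsPlan { std::size_t WordCount, TailBytes; };
static RepMovsPlan PlanRepMovs(std::size_t SizeVal, unsigned UBytes) {
  RepMovsPlan P = { SizeVal / UBytes, SizeVal % UBytes };
  return P;  // e.g. 19 bytes at QWORD width: 2 rep-movs words + 3 tail bytes
}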
- unsigned Offset = SizeVal - BytesLeft; - EVT DstVT = Dst.getValueType(); - EVT SrcVT = Src.getValueType(); - EVT SizeVT = Size.getValueType(); - Results.push_back(DAG.getMemcpy(Chain, dl, - DAG.getNode(ISD::ADD, dl, DstVT, Dst, - DAG.getConstant(Offset, DstVT)), - DAG.getNode(ISD::ADD, dl, SrcVT, Src, - DAG.getConstant(Offset, SrcVT)), - DAG.getConstant(BytesLeft, SizeVT), - Align, isVolatile, AlwaysInline, - DstSV, DstSVOff + Offset, - SrcSV, SrcSVOff + Offset)); - } - - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &Results[0], Results.size()); -} - -SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); @@ -6843,18 +6789,15 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { &MemOps[0], MemOps.size()); } -SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // X86-64 va_list is a struct { i32, i32, i8*, i8* }. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); - SDValue Chain = Op.getOperand(0); - SDValue SrcPtr = Op.getOperand(1); - SDValue SrcSV = Op.getOperand(2); report_fatal_error("VAArgInst is not yet implemented for x86-64!"); return SDValue(); } -SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { // X86-64 va_list is a struct { i32, i32, i8*, i8* }. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); SDValue Chain = Op.getOperand(0); @@ -6870,7 +6813,7 @@ SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { } SDValue -X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { +X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); switch (IntNo) { @@ -7111,7 +7054,11 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } } -SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); DebugLoc dl = Op.getDebugLoc(); @@ -7132,9 +7079,10 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { RetAddrFI, NULL, 0, false, false, 0); } -SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setFrameAddressIsTaken(true); + EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); @@ -7147,12 +7095,11 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { } SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { return DAG.getIntPtrConstant(2*TD->getPointerSize()); } -SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) -{ +SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 
MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); @@ -7176,7 +7123,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) } SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { SDValue Root = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function @@ -7273,6 +7220,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, break; } case CallingConv::X86_FastCall: + case CallingConv::X86_ThisCall: case CallingConv::Fast: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td @@ -7316,7 +7264,8 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, } } -SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { /* The rounding mode is in bits 11:10 of FPSR, and has the following settings: @@ -7378,7 +7327,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); } -SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -7412,7 +7361,7 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { return Op; } -SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -7442,7 +7391,7 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { return Op; } -SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); DebugLoc dl = Op.getDebugLoc(); @@ -7487,7 +7436,7 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { } -SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. 
The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" @@ -7555,7 +7504,7 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { return Sum; } -SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { EVT T = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); unsigned Reg = 0; @@ -7586,7 +7535,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { } SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { assert(Subtarget->is64Bit() && "Result not type legalized?"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); SDValue TheChain = Op.getOperand(0); @@ -7604,7 +7553,28 @@ SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, return DAG.getMergeValues(Ops, 2, dl); } -SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, + SelectionDAG &DAG) const { + EVT SrcVT = Op.getOperand(0).getValueType(); + EVT DstVT = Op.getValueType(); + assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && + Subtarget->hasMMX() && !DisableMMX) && + "Unexpected custom BIT_CONVERT"); + assert((DstVT == MVT::i64 || + (DstVT.isVector() && DstVT.getSizeInBits()==64)) && + "Unexpected custom BIT_CONVERT"); + // i64 <=> MMX conversions are Legal. + if (SrcVT==MVT::i64 && DstVT.isVector()) + return Op; + if (DstVT==MVT::i64 && SrcVT.isVector()) + return Op; + // MMX <=> MMX conversions are Legal. + if (SrcVT.isVector() && DstVT.isVector()) + return Op; + // All other conversions need to be expanded. + return SDValue(); +} +SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); DebugLoc dl = Node->getDebugLoc(); EVT T = Node->getValueType(0); @@ -7620,7 +7590,7 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { /// LowerOperation - Provide custom lowering hooks for some operations. /// -SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); @@ -7673,12 +7643,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); + case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); } } void X86TargetLowering:: ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl&Results, - SelectionDAG &DAG, unsigned NewOp) { + SelectionDAG &DAG, unsigned NewOp) const { EVT T = Node->getValueType(0); DebugLoc dl = Node->getDebugLoc(); assert (T == MVT::i64 && "Only know how to expand i64 atomics"); @@ -7703,7 +7674,7 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl&Results, /// with a new node built out of custom code. 
void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { DebugLoc dl = N->getDebugLoc(); switch (N->getOpcode()) { default: @@ -7839,6 +7810,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; + case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; @@ -8050,8 +8022,11 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, F->insert(MBBIter, newMBB); F->insert(MBBIter, nextMBB); - // Move all successors to thisMBB to nextMBB - nextMBB->transferSuccessors(thisMBB); + // Transfer the remainder of thisMBB and its successor edges to nextMBB. + nextMBB->splice(nextMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(bInstr)), + thisMBB->end()); + nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Update thisMBB to fall through to newMBB thisMBB->addSuccessor(newMBB); @@ -8114,7 +8089,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, // insert branch BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); - F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. + bInstr->eraseFromParent(); // The pseudo instruction is gone now. return nextMBB; } @@ -8159,8 +8134,11 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, F->insert(MBBIter, newMBB); F->insert(MBBIter, nextMBB); - // Move all successors to thisMBB to nextMBB - nextMBB->transferSuccessors(thisMBB); + // Transfer the remainder of thisMBB and its successor edges to nextMBB. + nextMBB->splice(nextMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(bInstr)), + thisMBB->end()); + nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Update thisMBB to fall through to newMBB thisMBB->addSuccessor(newMBB); @@ -8177,9 +8155,15 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, MachineOperand& dest1Oper = bInstr->getOperand(0); MachineOperand& dest2Oper = bInstr->getOperand(1); MachineOperand* argOpers[2 + X86AddrNumOperands]; - for (int i=0; i < 2 + X86AddrNumOperands; ++i) + for (int i=0; i < 2 + X86AddrNumOperands; ++i) { argOpers[i] = &bInstr->getOperand(i+2); + // We use some of the operands multiple times, so conservatively just + // clear any kill flags that might be present. + if (argOpers[i]->isReg() && argOpers[i]->isUse()) + argOpers[i]->setIsKill(false); + } + // x86 address has 5 operands: base, index, scale, displacement, and segment. int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] @@ -8271,7 +8255,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, // insert branch BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); - F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. + bInstr->eraseFromParent(); // The pseudo instruction is gone now. return nextMBB; } @@ -8305,8 +8289,11 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, F->insert(MBBIter, newMBB); F->insert(MBBIter, nextMBB); - // Move all successors of thisMBB to nextMBB - nextMBB->transferSuccessors(thisMBB); + // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
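// Editorial note (not part of the patch): this comment marks the rewrite the
// patch applies to every custom inserter. Distilled, under the assumption of
// the MachineBasicBlock API of this LLVM vintage:
//
//   nextMBB->splice(nextMBB->begin(), thisMBB,
//                   llvm::next(MachineBasicBlock::iterator(mInstr)),
//                   thisMBB->end());                   // move trailing MIs
//   nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // move edges, fix PHIs
//
// The old transferSuccessors() moved only the CFG edges; it neither carried
// the instructions following the pseudo into the continuation block nor
// rewrote PHIs in successors that still named thisMBB.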
+ nextMBB->splice(nextMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(mInstr)), + thisMBB->end()); + nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Update thisMBB to fall through to newMBB thisMBB->addSuccessor(newMBB); @@ -8374,7 +8361,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, // insert branch BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); - F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. + mInstr->eraseFromParent(); // The pseudo instruction is gone now. return nextMBB; } @@ -8384,7 +8371,6 @@ MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { - MachineFunction *F = BB->getParent(); DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); @@ -8406,7 +8392,7 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) .addReg(X86::XMM0); - F->DeleteMachineInstr(MI); + MI->eraseFromParent(); return BB; } @@ -8435,9 +8421,12 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( F->insert(MBBIter, XMMSaveMBB); F->insert(MBBIter, EndMBB); - // Set up the CFG. - // Move any original successors of MBB to the end block. - EndMBB->transferSuccessors(MBB); + // Transfer the remainder of MBB and its successor edges to EndMBB. + EndMBB->splice(EndMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), + MBB->end()); + EndMBB->transferSuccessorsAndUpdatePHIs(MBB); + // The original block will now fall through to the XMM save block. MBB->addSuccessor(XMMSaveMBB); // The XMMSaveMBB will fall through to the end block. @@ -8476,15 +8465,14 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( .addMemOperand(MMO); } - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return EndMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, - MachineBasicBlock *BB, - DenseMap *EM) const { + MachineBasicBlock *BB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -8506,79 +8494,138 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - unsigned Opc = - X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); - BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); F->insert(It, copy0MBB); F->insert(It, sinkMBB); - // Update machine-CFG edges by first adding all successors of the current - // block to the new block which will contain the Phi node for the select. - // Also inform sdisel of the edge changes. - for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) { - EM->insert(std::make_pair(*I, sinkMBB)); - sinkMBB->addSuccessor(*I); - } - // Next, remove all successors of the current block, and add the true - // and fallthrough blocks as its successors. - while (!BB->succ_empty()) - BB->removeSuccessor(BB->succ_begin()); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. 
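// Editorial note (not part of the patch): the loop below scans the CMOV
// pseudo's operands for a use of EFLAGS that is not marked killed; if one is
// found, EFLAGS is recorded as live-in on both new blocks so the machine
// verifier sees consistent liveness after the block is split.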
+ const MachineFunction *MF = BB->getParent(); + const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + BitVector ReservedRegs = TRI->getReservedRegs(*MF); + + for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { + const MachineOperand &MO = MI->getOperand(I); + if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; + unsigned Reg = MO.getReg(); + if (Reg != X86::EFLAGS) continue; + copy0MBB->addLiveIn(Reg); + sinkMBB->addLiveIn(Reg); + } + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + // Add the true and fallthrough blocks as its successors. BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); + // Create the conditional branch instruction. + unsigned Opc = + X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); + BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB - BB = copy0MBB; - - // Update machine-CFG edges - BB->addSuccessor(sinkMBB); + copy0MBB->addSuccessor(sinkMBB); // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... - BB = sinkMBB; - BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(X86::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. - return BB; + MI->eraseFromParent(); // The pseudo instruction is gone now. + return sinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, - MachineBasicBlock *BB, - DenseMap *EM) const { + MachineBasicBlock *BB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); - MachineFunction *F = BB->getParent(); // The lowering is pretty easy: we're just emitting the call to _alloca. The // non-trivial part is impdef of ESP. // FIXME: The code should be tweaked as soon as we'll try to do codegen for // mingw-w64. - BuildMI(BB, DL, TII->get(X86::CALLpcrel32)) + BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol("_alloca") .addReg(X86::EAX, RegState::Implicit) .addReg(X86::ESP, RegState::Implicit) .addReg(X86::EAX, RegState::Define | RegState::Implicit) .addReg(X86::ESP, RegState::Define | RegState::Implicit); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, + MachineBasicBlock *BB) const { + // This is pretty easy. We're taking the value that we received from + // our load from the relocation, sticking it in either RDI (x86-64) + // or EAX and doing an indirect call. The return value will then + // be in the normal return register. 
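// Editorial note (not part of the patch): assuming Darwin's TLV scheme that
// this pseudo serves, the x86-64 sequence built below is roughly
//   movq  _var@TLVP(%rip), %rdi    ; load the TLV descriptor pointer
//   callq *(%rdi)                  ; call through the descriptor's first slot
// leaving the address of the thread-local variable in the return register.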
+  const X86InstrInfo *TII
+    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction *F = BB->getParent();
+
+  assert(MI->getOperand(3).isGlobal() && "This should be a global");
+
+  if (Subtarget->is64Bit()) {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV64rm), X86::RDI)
+    .addReg(X86::RIP)
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
+    addDirectMem(MIB, X86::RDI).addReg(0);
+  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV32rm), X86::EAX)
+    .addReg(0)
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX).addReg(0);
+  } else {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV32rm), X86::EAX)
+    .addReg(TII->getGlobalBaseReg(F))
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX).addReg(0);
+  }
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
   return BB;
 }
 
 MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
-                                               MachineBasicBlock *BB,
-                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+                                               MachineBasicBlock *BB) const {
   switch (MI->getOpcode()) {
   default: assert(false && "Unexpected instr type to insert");
   case X86::MINGW_ALLOCA:
-    return EmitLoweredMingwAlloca(MI, BB, EM);
+    return EmitLoweredMingwAlloca(MI, BB);
+  case X86::TLSCall_32:
+  case X86::TLSCall_64:
+    return EmitLoweredTLSCall(MI, BB);
   case X86::CMOV_GR8:
   case X86::CMOV_V1I64:
   case X86::CMOV_FR32:
@@ -8591,7 +8638,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::CMOV_RFP32:
   case X86::CMOV_RFP64:
   case X86::CMOV_RFP80:
-    return EmitLoweredSelect(MI, BB, EM);
+    return EmitLoweredSelect(MI, BB);
 
   case X86::FP32_TO_INT16_IN_MEM:
   case X86::FP32_TO_INT32_IN_MEM:
@@ -8609,23 +8656,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     // mode when truncating to an integer value.
     MachineFunction *F = BB->getParent();
     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
-    addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
 
     // Load the old value of the high byte of the control word...
     unsigned OldCW =
      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
-    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                       CWFrameIdx);
 
     // Set the high part to be round to zero...
-    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);
 
     // Reload the modified control word now...
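    // Editorial note (not part of the patch): this save/override/restore dance
    // exists because x87 FIST stores honor the current rounding mode, while a
    // C-style float->int conversion must truncate. 0xC7F sets the rounding
    // control bits (11:10) to round-toward-zero with all exceptions masked;
    // the saved control word is restored once the conversion is done.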
- addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); // Restore the memory image of control word to original value - addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx) + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) .addReg(OldCW); // Get the X86 opcode to use. @@ -8664,30 +8713,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, } else { AM.Disp = Op.getImm(); } - addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM) + addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) .addReg(MI->getOperand(X86AddrNumOperands).getReg()); // Reload the original control word now. - addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. - return BB; - } - // DBG_VALUE. Only the frame index case is done here. - case X86::DBG_VALUE: { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); - X86AddressMode AM; - MachineFunction *F = BB->getParent(); - AM.BaseType = X86AddressMode::FrameIndexBase; - AM.Base.FrameIndex = MI->getOperand(0).getImm(); - addFullAddress(BuildMI(BB, DL, TII->get(X86::DBG_VALUE)), AM). - addImm(MI->getOperand(1).getImm()). - addMetadata(MI->getOperand(2).getMetadata()); - F->DeleteMachineInstr(MI); // Remove pseudo. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } - // String/text processing lowering. case X86::PCMPISTRM128REG: return EmitPCMP(MI, BB, 3, false /* in-mem */); @@ -9594,9 +9629,13 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, } static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + EVT VT = N->getValueType(0); - if (VT != MVT::i64 || !Subtarget->is64Bit()) + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) @@ -9606,6 +9645,8 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); + if (!N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); SDValue ShAmt0 = N0.getOperand(1); if (ShAmt0.getValueType() != MVT::i8) @@ -9628,11 +9669,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, std::swap(ShAmt0, ShAmt1); } + unsigned Bits = VT.getSizeInBits(); if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (ConstantSDNode *SumC = dyn_cast(Sum)) { - if (SumC->getSExtValue() == 64 && - ShAmt1.getOperand(1) == ShAmt0) + SDValue ShAmt1Op1 = ShAmt1.getOperand(1); + if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) + ShAmt1Op1 = ShAmt1Op1.getOperand(0); + if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) return DAG.getNode(Opc, DL, VT, Op0, Op1, DAG.getNode(ISD::TRUNCATE, DL, @@ -9641,7 +9685,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, } else if (ConstantSDNode *ShAmt1C = dyn_cast(ShAmt1)) { ConstantSDNode *ShAmt0C = dyn_cast(ShAmt0); if (ShAmt0C && - ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == 64) + ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) return DAG.getNode(Opc, DL, VT, N0.getOperand(0), N1.getOperand(0), 
DAG.getNode(ISD::TRUNCATE, DL, @@ -9807,7 +9851,7 @@ static SDValue PerformBTCombine(SDNode *N, APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); - TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); @@ -9828,58 +9872,6 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// On X86 and X86-64, atomic operations are lowered to locked instructions. -// Locked instructions, in turn, have implicit fence semantics (all memory -// operations are flushed before issuing the locked instruction, and the -// are not buffered), so we can fold away the common pattern of -// fence-atomic-fence. -static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) { - SDValue atomic = N->getOperand(0); - switch (atomic.getOpcode()) { - case ISD::ATOMIC_CMP_SWAP: - case ISD::ATOMIC_SWAP: - case ISD::ATOMIC_LOAD_ADD: - case ISD::ATOMIC_LOAD_SUB: - case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_NAND: - case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_MAX: - case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: - break; - default: - return SDValue(); - } - - SDValue fence = atomic.getOperand(0); - if (fence.getOpcode() != ISD::MEMBARRIER) - return SDValue(); - - switch (atomic.getOpcode()) { - case ISD::ATOMIC_CMP_SWAP: - return DAG.UpdateNodeOperands(atomic, fence.getOperand(0), - atomic.getOperand(1), atomic.getOperand(2), - atomic.getOperand(3)); - case ISD::ATOMIC_SWAP: - case ISD::ATOMIC_LOAD_ADD: - case ISD::ATOMIC_LOAD_SUB: - case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_NAND: - case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_MAX: - case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: - return DAG.UpdateNodeOperands(atomic, fence.getOperand(0), - atomic.getOperand(1), atomic.getOperand(2)); - default: - return SDValue(); - } -} - static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) @@ -9920,14 +9912,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SHL: case ISD::SRA: case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); - case ISD::OR: return PerformOrCombine(N, DAG, Subtarget); + case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); - case ISD::MEMBARRIER: return PerformMEMBARRIERCombine(N, DAG); case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); } @@ -9941,14 +9932,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (!isTypeLegal(VT)) return false; - if (!Promote16Bit || VT != MVT::i16) + if (VT != MVT::i16) return true; switch (Opc) { default: return true; + case ISD::LOAD: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: case ISD::SHL: - case ISD::SRA: case ISD::SRL: case 
ISD::SUB:
   case ISD::ADD:
@@ -9960,52 +9954,80 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   }
 }
 
+static bool MayFoldLoad(SDValue Op) {
+  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
+}
+
+static bool MayFoldIntoStore(SDValue Op) {
+  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
+}
+
 /// IsDesirableToPromoteOp - This method query the target whether it is
 /// beneficial for dag combiner to promote the specified node. If true, it
 /// should return the desired promotion type by reference.
 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
-  if (!Promote16Bit)
-    return false;
-
   EVT VT = Op.getValueType();
   if (VT != MVT::i16)
     return false;
 
-  bool Commute = true;
+  bool Promote = false;
+  bool Commute = false;
   switch (Op.getOpcode()) {
-  default: return false;
+  default: break;
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    // If the non-extending load has a single use and it's not live out, then it
+    // might be folded.
+    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
+        Op.hasOneUse()*/) {
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+        // The only case where we'd want to promote LOAD (rather than it being
+        // promoted as an operand) is when its only use is live out.
+        if (UI->getOpcode() != ISD::CopyToReg)
+          return false;
+      }
+    }
+    Promote = true;
+    break;
+  }
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    Promote = true;
+    break;
   case ISD::SHL:
-  case ISD::SRA:
   case ISD::SRL: {
     SDValue N0 = Op.getOperand(0);
     // Look out for (store (shl (load), x)).
-    if (isa<LoadSDNode>(N0) && N0.hasOneUse() &&
-        Op.hasOneUse() && Op.getNode()->use_begin()->getOpcode() == ISD::STORE)
+    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
       return false;
+    Promote = true;
     break;
   }
-  case ISD::SUB:
-    Commute = false;
-    // fallthrough
   case ISD::ADD:
   case ISD::MUL:
   case ISD::AND:
   case ISD::OR:
-  case ISD::XOR: {
+  case ISD::XOR:
+    Commute = true;
+    // fallthrough
+  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
-    if (!Commute && isa<LoadSDNode>(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
-    if ((isa<LoadSDNode>(N0) && N0.hasOneUse()) && !isa<ConstantSDNode>(N1))
+    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
-    if ((isa<LoadSDNode>(N1) && N1.hasOneUse()) && !isa<ConstantSDNode>(N0))
+    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
+    Promote = true;
   }
   }
 
   PVT = MVT::i32;
-  return true;
+  return Promote;
 }
 
 //===----------------------------------------------------------------------===//
@@ -10019,8 +10041,8 @@ static bool LowerToBSwap(CallInst *CI) {
   // so don't worry about this.
 
   // Verify this is a simple bswap.
-  if (CI->getNumOperands() != 2 ||
-      CI->getType() != CI->getOperand(1)->getType() ||
+  if (CI->getNumArgOperands() != 1 ||
+      CI->getType() != CI->getArgOperand(0)->getType() ||
      !CI->getType()->isIntegerTy())
    return false;
 
@@ -10033,7 +10055,7 @@ static bool LowerToBSwap(CallInst *CI) {
   Module *M = CI->getParent()->getParent()->getParent();
   Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
 
-  Value *Op = CI->getOperand(1);
+  Value *Op = CI->getArgOperand(0);
   Op = CallInst::Create(Int, Op, CI->getName(), CI);
 
   CI->replaceAllUsesWith(Op);
@@ -10166,7 +10188,6 @@ LowerXConstraint(EVT ConstraintVT) const {
 /// vector.  If it is invalid, don't add anything to Ops.
 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                      char Constraint,
-                                                     bool hasMemory,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
   SDValue Result(0, 0);
@@ -10208,9 +10229,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   case 'e': {
     // 32-bit signed value
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
-      const ConstantInt *CI = C->getConstantIntValue();
-      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
-                                  C->getSExtValue())) {
+      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+                                           C->getSExtValue())) {
         // Widen to 64 bits here to get it sign extended.
         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
         break;
@@ -10223,9 +10243,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   case 'Z': {
     // 32-bit unsigned value
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
-      const ConstantInt *CI = C->getConstantIntValue();
-      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
-                                  C->getZExtValue())) {
+      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+                                           C->getZExtValue())) {
         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
         break;
       }
@@ -10242,6 +10261,13 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
     break;
   }
 
+  // In any sort of PIC mode addresses need to be computed at runtime by
+  // adding in a register or some sort of table lookup.  These can't
+  // be used as immediates.
+  if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC() ||
+      Subtarget->isPICStyleRIPRel())
+    return;
+
   // If we are in non-pic codegen mode, we allow the address of a global (with
   // an optional displacement) to be used with 'i'.
   GlobalAddressSDNode *GA = 0;
@@ -10277,11 +10303,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                        getTargetMachine())))
      return;
 
-    if (hasMemory)
-      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
-    else
-      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
-    Result = Op;
+    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+                                        GA->getValueType(0), Offset);
     break;
   }
 
@@ -10290,8 +10313,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
     Ops.push_back(Result);
     return;
   }
-  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
-                                                      Ops, DAG);
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
 std::vector X86TargetLowering::