Trailing whitespace

[oota-llvm.git] / lib / Target / ARM / ARMFastISel.cpp
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp

index 450a9bc605c53a5ffea1b26ef9c4b936d576b2fb..3eac44bc8d7098a08c87ac6277f4ac97069f15a0 100644 (file)
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -33,7 +33,9 @@
  #include "llvm/CodeGen/MachineModuleInfo.h"
  #include "llvm/CodeGen/MachineConstantPool.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
  #include "llvm/Support/CallSite.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/ErrorHandling.h"
@@ -46,19 +48,14 @@
  using namespace llvm;
  
  static cl::opt<bool>
-EnableARMFastISel("arm-fast-isel",
-                    cl::desc("Turn on experimental ARM fast-isel support"),
+DisableARMFastISel("disable-arm-fast-isel",
+                    cl::desc("Turn off experimental ARM fast-isel support"),
                      cl::init(false), cl::Hidden);
  
  namespace {
  
  class ARMFastISel : public FastISel {
  
-  typedef struct AddrBase {
-    unsigned Reg;
-    unsigned FrameIndex;
-  } AddrBase;
-
    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
    /// make the right decision when generating code for different targets.
    const ARMSubtarget *Subtarget;
@@ -122,28 +119,29 @@ class ARMFastISel : public FastISel {
  
      // Instruction selection routines.
    private:
-    virtual bool SelectLoad(const Instruction *I);
-    virtual bool SelectStore(const Instruction *I);
-    virtual bool SelectBranch(const Instruction *I);
-    virtual bool SelectCmp(const Instruction *I);
-    virtual bool SelectFPExt(const Instruction *I);
-    virtual bool SelectFPTrunc(const Instruction *I);
-    virtual bool SelectBinaryOp(const Instruction *I, unsigned ISDOpcode);
-    virtual bool SelectSIToFP(const Instruction *I);
-    virtual bool SelectFPToSI(const Instruction *I);
-    virtual bool SelectSDiv(const Instruction *I);
-    virtual bool SelectSRem(const Instruction *I);
-    virtual bool SelectCall(const Instruction *I);
-    virtual bool SelectSelect(const Instruction *I);
+    bool SelectLoad(const Instruction *I);
+    bool SelectStore(const Instruction *I);
+    bool SelectBranch(const Instruction *I);
+    bool SelectCmp(const Instruction *I);
+    bool SelectFPExt(const Instruction *I);
+    bool SelectFPTrunc(const Instruction *I);
+    bool SelectBinaryOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectSIToFP(const Instruction *I);
+    bool SelectFPToSI(const Instruction *I);
+    bool SelectSDiv(const Instruction *I);
+    bool SelectSRem(const Instruction *I);
+    bool SelectCall(const Instruction *I);
+    bool SelectSelect(const Instruction *I);
+    bool SelectRet(const Instruction *I);
  
      // Utility routines.
    private:
      bool isTypeLegal(const Type *Ty, EVT &VT);
      bool isLoadTypeLegal(const Type *Ty, EVT &VT);
-    bool ARMEmitLoad(EVT VT, unsigned &ResultReg, AddrBase Base, int Offset);
-    bool ARMEmitStore(EVT VT, unsigned SrcReg, AddrBase Base, int Offset);
-    bool ARMComputeRegOffset(const Value *Obj, AddrBase &Base, int &Offset);
-    void ARMSimplifyRegOffset(AddrBase &Base, int &Offset, EVT VT);
+    bool ARMEmitLoad(EVT VT, unsigned &ResultReg, unsigned Base, int Offset);
+    bool ARMEmitStore(EVT VT, unsigned SrcReg, unsigned Base, int Offset);
+    bool ARMComputeRegOffset(const Value *Obj, unsigned &Base, int &Offset);
+    void ARMSimplifyRegOffset(unsigned &Base, int &Offset, EVT VT);
      unsigned ARMMaterializeFP(const ConstantFP *CFP, EVT VT);
      unsigned ARMMaterializeInt(const Constant *C, EVT VT);
      unsigned ARMMaterializeGV(const GlobalValue *GV, EVT VT);
@@ -152,6 +150,8 @@ class ARMFastISel : public FastISel {
  
      // Call handling routines.
    private:
+    bool FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+                        unsigned &ResultReg);
      CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool Return);
      bool ProcessCallArgs(SmallVectorImpl<Value*> &Args,
                           SmallVectorImpl<unsigned> &ArgRegs,
@@ -449,7 +449,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, EVT VT) {
      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                              TII.get(ARM::LDRcp), DestReg)
                      .addConstantPoolIndex(Idx)
-                    .addReg(0).addImm(0));
+                    .addImm(0));
  
    return DestReg;
  }
@@ -521,7 +521,7 @@ unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
    if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
  
    EVT VT;
-  if (!isTypeLegal(AI->getType(), VT)) return false;
+  if (!isLoadTypeLegal(AI->getType(), VT)) return false;
  
    DenseMap<const AllocaInst*, int>::iterator SI =
      FuncInfo.StaticAllocaMap.find(AI);
@@ -565,7 +565,7 @@ bool ARMFastISel::isLoadTypeLegal(const Type *Ty, EVT &VT) {
  }
  
  // Computes the Reg+Offset to get to an object.
-bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, AddrBase &Base,
+bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, unsigned &Base,
                                        int &Offset) {
    // Some boilerplate from the X86 FastISel.
    const User *U = NULL;
@@ -610,7 +610,7 @@ bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, AddrBase &Base,
      }
      case Instruction::GetElementPtr: {
        int SavedOffset = Offset;
-      AddrBase SavedBase = Base;
+      unsigned SavedBase = Base;
        int TmpOffset = Offset;
  
        // Iterate through the GEP folding the constants into offsets where
@@ -632,7 +632,7 @@ bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, AddrBase &Base,
              if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
                // Constant-offset addressing.
                TmpOffset += CI->getSExtValue() * S;
-            } else if (0 && isa<AddOperator>(Op) &&
+            } else if (isa<AddOperator>(Op) &&
                         isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
                // An add with a constant operand. Fold the constant.
                ConstantInt *CI =
@@ -658,21 +658,13 @@ bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, AddrBase &Base,
        break;
      }
      case Instruction::Alloca: {
-      // TODO: Fix this to do intermediate loads, etc.
-      if (Offset != 0) return false;
-
        const AllocaInst *AI = cast<AllocaInst>(Obj);
-      DenseMap<const AllocaInst*, int>::iterator SI =
-        FuncInfo.StaticAllocaMap.find(AI);
-      if (SI != FuncInfo.StaticAllocaMap.end()) {
-        Base.Reg = ARM::SP;
-        Base.FrameIndex = SI->second;
-        return true;
-      }
-      // Don't handle dynamic allocas.
-      assert(!FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(Obj)) &&
-             "Alloca should have been handled earlier!");
-      return false;
+      unsigned Reg = TargetMaterializeAlloca(AI);
+
+      if (Reg == 0) return false;
+
+      Base = Reg;
+      return true;
      }
    }
  
@@ -682,20 +674,40 @@ bool ARMFastISel::ARMComputeRegOffset(const Value *Obj, AddrBase &Base,
      unsigned Tmp = ARMMaterializeGV(GV, TLI.getValueType(Obj->getType()));
      if (Tmp == 0) return false;
  
-    Base.Reg = Tmp;
+    Base = Tmp;
      return true;
    }
  
    // Try to get this in a register if nothing else has worked.
-  if (Base.Reg == 0) Base.Reg = getRegForValue(Obj);
-  return Base.Reg != 0;
+  if (Base == 0) Base  = getRegForValue(Obj);
+  return Base != 0;
  }
  
-void ARMFastISel::ARMSimplifyRegOffset(AddrBase &Base, int &Offset, EVT VT) {
+void ARMFastISel::ARMSimplifyRegOffset(unsigned &Base, int &Offset, EVT VT) {
  
-  // Since the offset may be too large for the load instruction
+  assert(VT.isSimple() && "Non-simple types are invalid here!");
+
+  bool needsLowering = false;
+  switch (VT.getSimpleVT().SimpleTy) {
+    default:
+      assert(false && "Unhandled load/store type!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      // Integer loads/stores handle 12-bit offsets.
+      needsLowering = ((Offset & 0xfff) != Offset);
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      // Floating point loads/stores handle 8-bit offsets.
+      needsLowering = ((Offset & 0xff) != Offset);
+      break;
+  }
+
+  // If the offset is too large for the load/store instruction,
    // get the reg+offset into a register.
-  if (Base.Reg != ARM::SP && Offset != 0) {
+  if (needsLowering) {
      ARMCC::CondCodes Pred = ARMCC::AL;
      unsigned PredReg = 0;
  
@@ -705,21 +717,21 @@ void ARMFastISel::ARMSimplifyRegOffset(AddrBase &Base, int &Offset, EVT VT) {
  
      if (!isThumb)
        emitARMRegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                              BaseReg, Base.Reg, Offset, Pred, PredReg,
+                              BaseReg, Base, Offset, Pred, PredReg,
                                static_cast<const ARMBaseInstrInfo&>(TII));
      else {
        assert(AFI->isThumb2Function());
        emitT2RegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                             BaseReg, Base.Reg, Offset, Pred, PredReg,
+                             BaseReg, Base, Offset, Pred, PredReg,
                               static_cast<const ARMBaseInstrInfo&>(TII));
      }
      Offset = 0;
-    Base.Reg = BaseReg;
+    Base = BaseReg;
    }
  }
  
  bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg,
-                              AddrBase Base, int Offset) {
+                              unsigned Base, int Offset) {
  
    assert(VT.isSimple() && "Non-simple types are invalid here!");
    unsigned Opc;
@@ -730,17 +742,15 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg,
        // This is mostly going to be Neon/vector support.
        return false;
      case MVT::i16:
-      Opc = isThumb ? ARM::t2LDRHi8 : ARM::LDRH;
+      Opc = isThumb ? ARM::t2LDRHi12 : ARM::LDRH;
        RC = ARM::GPRRegisterClass;
-      VT = MVT::i32;
        break;
      case MVT::i8:
-      Opc = isThumb ? ARM::t2LDRBi8 : ARM::LDRB;
+      Opc = isThumb ? ARM::t2LDRBi12 : ARM::LDRBi12;
        RC = ARM::GPRRegisterClass;
-      VT = MVT::i32;
        break;
      case MVT::i32:
-      Opc = isThumb ? ARM::t2LDRi8 : ARM::LDR;
+      Opc = isThumb ? ARM::t2LDRi12 : ARM::LDRi12;
        RC = ARM::GPRRegisterClass;
        break;
      case MVT::f32:
@@ -757,25 +767,16 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg,
  
    ResultReg = createResultReg(RC);
  
-  // For now with the additions above the offset should be zero - thus we
-  // can always fit into an i8.
-  assert((Base.Reg == ARM::SP || Offset == 0) &&
-          "Offset not zero and not a stack load!");
-
-  if (Base.Reg == ARM::SP && Offset == 0)
-    TII.loadRegFromStackSlot(*FuncInfo.MBB, *FuncInfo.InsertPt,
-                             ResultReg, Base.FrameIndex, RC,
-                             TM.getRegisterInfo());
-  // The thumb and floating point instructions both take 2 operands, ARM takes
-  // another register.
-  else if (isFloat || isThumb)
-    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                            TII.get(Opc), ResultReg)
-                    .addReg(Base.Reg).addImm(Offset));
-  else
-    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                            TII.get(Opc), ResultReg)
-                    .addReg(Base.Reg).addReg(0).addImm(Offset));
+  ARMSimplifyRegOffset(Base, Offset, VT);
+
+  // addrmode5 expects the offset already divided by 4 (the selection DAG
+  // divides it and it is multiplied back later), so do the same here.
+  if (isFloat)
+    Offset /= 4;
+
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(Opc), ResultReg)
+                  .addReg(Base).addImm(Offset));
    return true;
  }
  
@@ -786,15 +787,13 @@ bool ARMFastISel::SelectLoad(const Instruction *I) {
      return false;
  
    // Our register and offset with innocuous defaults.
-  AddrBase Base = { 0, 0 };
+  unsigned Base = 0;
    int Offset = 0;
  
    // See if we can handle this as Reg + Offset
    if (!ARMComputeRegOffset(I->getOperand(0), Base, Offset))
      return false;
  
-  ARMSimplifyRegOffset(Base, Offset, VT);
-
    unsigned ResultReg;
    if (!ARMEmitLoad(VT, ResultReg, Base, Offset)) return false;
  
@@ -803,24 +802,20 @@ bool ARMFastISel::SelectLoad(const Instruction *I) {
  }
  
  bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg,
-                               AddrBase Base, int Offset) {
+                               unsigned Base, int Offset) {
    unsigned StrOpc;
    bool isFloat = false;
-  // VT is set here only for use in the alloca stores below - those are promoted
-  // to reg size always.
    switch (VT.getSimpleVT().SimpleTy) {
      default: return false;
      case MVT::i1:
      case MVT::i8:
-      VT = MVT::i32;
-      StrOpc = isThumb ? ARM::t2STRBi8 : ARM::STRB;
+      StrOpc = isThumb ? ARM::t2STRBi12 : ARM::STRB;
        break;
      case MVT::i16:
-      VT = MVT::i32;
-      StrOpc = isThumb ? ARM::t2STRHi8 : ARM::STRH;
+      StrOpc = isThumb ? ARM::t2STRHi12 : ARM::STRH;
        break;
      case MVT::i32:
-      StrOpc = isThumb ? ARM::t2STRi8 : ARM::STR;
+      StrOpc = isThumb ? ARM::t2STRi12 : ARM::STR;
        break;
      case MVT::f32:
        if (!Subtarget->hasVFP2()) return false;
@@ -834,20 +829,23 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg,
        break;
    }
  
-  if (Base.Reg == ARM::SP && Offset == 0)
-    TII.storeRegToStackSlot(*FuncInfo.MBB, *FuncInfo.InsertPt,
-                            SrcReg, true /*isKill*/, Base.FrameIndex,
-                            TLI.getRegClassFor(VT), TM.getRegisterInfo());
+  ARMSimplifyRegOffset(Base, Offset, VT);
+
+  // addrmode5 expects the offset already divided by 4 (the selection DAG
+  // divides it and it is multiplied back later), so do the same here.
+  if (isFloat)
+    Offset /= 4;
+
    // The thumb addressing mode has operands swapped from the arm addressing
    // mode, the floating point one only has two operands.
-  else if (isFloat || isThumb)
+  if (isFloat || isThumb)
      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                              TII.get(StrOpc))
-                    .addReg(SrcReg).addReg(Base.Reg).addImm(Offset));
+                    .addReg(SrcReg).addReg(Base).addImm(Offset));
    else
      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                              TII.get(StrOpc))
-                    .addReg(SrcReg).addReg(Base.Reg).addReg(0).addImm(Offset));
+                    .addReg(SrcReg).addReg(Base).addReg(0).addImm(Offset));
  
    return true;
  }
@@ -867,15 +865,13 @@ bool ARMFastISel::SelectStore(const Instruction *I) {
      return false;
  
    // Our register and offset with innocuous defaults.
-  AddrBase Base = { 0, 0 };
+  unsigned Base = 0;
    int Offset = 0;
  
    // See if we can handle this as Reg + Offset
    if (!ARMComputeRegOffset(I->getOperand(1), Base, Offset))
      return false;
  
-  ARMSimplifyRegOffset(Base, Offset, VT);
-
    if (!ARMEmitStore(VT, SrcReg, Base, Offset)) return false;
  
    return true;
@@ -1242,6 +1238,18 @@ bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) {
  
  // Call Handling Code
  
+bool ARMFastISel::FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src,
+                                 EVT SrcVT, unsigned &ResultReg) {
+  unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
+                           Src, /*TODO: Kill=*/false);
+
+  if (RR != 0) {
+    ResultReg = RR;
+    return true;
+  } else
+    return false;
+}
+
  // This is largely taken directly from CCAssignFnForNode - we don't support
  // varargs in FastISel so that part has been removed.
  // TODO: We may not support all of this.
@@ -1249,8 +1257,12 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) {
    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
-  case CallingConv::C:
    case CallingConv::Fast:
+    // Treat fastcc like C for now; the casts just silence compiler warnings.
+    (void)RetFastCC_ARM_APCS;
+    (void)FastCC_ARM_APCS;
+    // Fallthrough
+  case CallingConv::C:
      // Use target triple & subtarget features to do actual dispatch.
      if (Subtarget->isAAPCS_ABI()) {
        if (Subtarget->hasVFP2() &&
@@ -1295,27 +1307,85 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
      unsigned Arg = ArgRegs[VA.getValNo()];
      EVT ArgVT = ArgVTs[VA.getValNo()];
  
+    // We don't handle NEON parameters yet.
+    if (VA.getLocVT().isVector() && VA.getLocVT().getSizeInBits() > 64)
+      return false;
+
      // Handle arg promotion, etc.
      switch (VA.getLocInfo()) {
        case CCValAssign::Full: break;
-      default:
-      // TODO: Handle arg promotion.
-      return false;
+      case CCValAssign::SExt: {
+        bool Emitted = FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+                                         Arg, ArgVT, Arg);
+        assert(Emitted && "Failed to emit a sext!"); Emitted=Emitted;
+        Emitted = true;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      case CCValAssign::ZExt: {
+        bool Emitted = FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+                                         Arg, ArgVT, Arg);
+        assert(Emitted && "Failed to emit a zext!"); Emitted=Emitted;
+        Emitted = true;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      case CCValAssign::AExt: {
+        bool Emitted = FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
+                                         Arg, ArgVT, Arg);
+        if (!Emitted)
+          Emitted = FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+                                      Arg, ArgVT, Arg);
+        if (!Emitted)
+          Emitted = FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+                                      Arg, ArgVT, Arg);
+
+        assert(Emitted && "Failed to emit a aext!"); Emitted=Emitted;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      case CCValAssign::BCvt: {
+        unsigned BC = FastEmit_r(ArgVT.getSimpleVT(),
+                                 VA.getLocVT().getSimpleVT(),
+                                 ISD::BIT_CONVERT, Arg, /*TODO: Kill=*/false);
+        assert(BC != 0 && "Failed to emit a bitcast!");
+        Arg = BC;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      default: llvm_unreachable("Unknown arg promotion!");
      }
  
      // Now copy/store arg to correct locations.
-    // TODO: We need custom lowering for f64 args.
      if (VA.isRegLoc() && !VA.needsCustom()) {
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
                VA.getLocReg())
        .addReg(Arg);
        RegArgs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      // TODO: We need custom lowering for vector (v2f64) args.
+      if (VA.getLocVT() != MVT::f64) return false;
+
+      CCValAssign &NextVA = ArgLocs[++i];
+
+      // TODO: Only handle register args for now.
+      if(!(VA.isRegLoc() && NextVA.isRegLoc())) return false;
+
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(ARM::VMOVRRD), VA.getLocReg())
+                      .addReg(NextVA.getLocReg(), RegState::Define)
+                      .addReg(Arg));
+      RegArgs.push_back(VA.getLocReg());
+      RegArgs.push_back(NextVA.getLocReg());
      } else {
-      // Need to store
-      return false;
+      assert(VA.isMemLoc());
+      // Need to store on the stack.
+      unsigned Base = ARM::SP;
+      int Offset = VA.getLocMemOffset();
+
+      if (!ARMEmitStore(ArgVT, Arg, Base, Offset)) return false;
      }
    }
-
    return true;
  }
  
@@ -1338,24 +1408,16 @@ bool ARMFastISel::FinishCall(EVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
      if (RVLocs.size() == 2 && RetVT.getSimpleVT().SimpleTy == MVT::f64) {
        // For this move we copy into two registers and then move into the
        // double fp reg we want.
-      // TODO: Are the copies necessary?
-      TargetRegisterClass *CopyRC = TLI.getRegClassFor(MVT::i32);
-      unsigned Copy1 = createResultReg(CopyRC);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
-              Copy1).addReg(RVLocs[0].getLocReg());
-      UsedRegs.push_back(RVLocs[0].getLocReg());
-
-      unsigned Copy2 = createResultReg(CopyRC);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
-              Copy2).addReg(RVLocs[1].getLocReg());
-      UsedRegs.push_back(RVLocs[1].getLocReg());
-
        EVT DestVT = RVLocs[0].getValVT();
        TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT);
        unsigned ResultReg = createResultReg(DstRC);
        AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                TII.get(ARM::VMOVDRR), ResultReg)
-                      .addReg(Copy1).addReg(Copy2));
+                      .addReg(RVLocs[0].getLocReg())
+                      .addReg(RVLocs[1].getLocReg()));
+
+      UsedRegs.push_back(RVLocs[0].getLocReg());
+      UsedRegs.push_back(RVLocs[1].getLocReg());
  
        // Finally update the result.
        UpdateValueMap(I, ResultReg);
@@ -1377,6 +1439,69 @@ bool ARMFastISel::FinishCall(EVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
    return true;
  }
  
+bool ARMFastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (F.isVarArg())
+    return false;
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(),
+                  Outs, TLI);
+
+    // Analyze the return value(s), assigning a location to each one.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext());
+    CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */));
+
+    const Value *RV = Ret->getOperand(0);
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+
+    // Only handle values that need no extension or conversion for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+    // TODO: For now, don't try to handle cases where getLocInfo()
+    // says Full but the types don't match.
+    if (VA.getValVT() != TLI.getValueType(RV->getType()))
+      return false;
+
+    // Make the copy.
+    unsigned SrcReg = Reg + VA.getValNo();
+    unsigned DstReg = VA.getLocReg();
+    const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!SrcRC->contains(DstReg))
+      return false;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            DstReg).addReg(SrcReg);
+
+    // Mark the register as live out of the function.
+    MRI.addLiveOut(VA.getLocReg());
+  }
+
+  unsigned RetOpc = isThumb ? ARM::tBX_RET : ARM::BX_RET;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(RetOpc)));
+  return true;
+}
+
  // A quick function that will emit a call for a named libcall in F with the
  // vector of passed arguments for the Instruction in I. We can assume that we
  // can emit a call for any libcall we can produce. This is an abridged version
@@ -1472,11 +1597,8 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
    // Check the calling convention.
    ImmutableCallSite CS(CI);
    CallingConv::ID CC = CS.getCallingConv();
+
    // TODO: Avoid some calling conventions?
-  if (CC != CallingConv::C) {
-    // errs() << "Can't handle calling convention: " << CC << "\n";
-    return false;
-  }
  
    // Let SDISel handle vararg functions.
    const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
@@ -1607,6 +1729,8 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
        return SelectCall(I);
      case Instruction::Select:
        return SelectSelect(I);
+    case Instruction::Ret:
+      return SelectRet(I);
      default: break;
    }
    return false;
@@ -1617,7 +1741,7 @@ namespace llvm {
      // Completely untested on non-darwin.
      const TargetMachine &TM = funcInfo.MF->getTarget();
      const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>();
-    if (Subtarget->isTargetDarwin() && EnableARMFastISel)
+    if (Subtarget->isTargetDarwin() && !DisableARMFastISel)
        return new ARMFastISel(funcInfo);
      return 0;
    }