ARM LDM/STM system instruction variants.

[oota-llvm.git] / lib / Target / ARM / ARMFastISel.cpp
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp

index 432abb500cea2ead218429d43e567561d86ede87..0a4faa2c52ef806c382f264c06504a0fc7ea68a6 100644 (file)
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -37,7 +37,6 @@
  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineMemOperand.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
  #include "llvm/Support/CallSite.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/ErrorHandling.h"
@@ -179,12 +178,15 @@ class ARMFastISel : public FastISel {
      bool isLoadTypeLegal(Type *Ty, MVT &VT);
      bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
                      bool isZExt);
-    bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, bool isZExt,
-                     bool allocReg);
+    bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, unsigned Alignment = 0,
+                     bool isZExt = true, bool allocReg = true);
                       
-    bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr);
+    bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr,
+                      unsigned Alignment = 0);
      bool ARMComputeAddress(const Value *Obj, Address &Addr);
      void ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3);
+    bool ARMIsMemCpySmall(uint64_t Len);
+    bool ARMTryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len);
      unsigned ARMEmitIntExt(EVT SrcVT, unsigned SrcReg, EVT DestVT, bool isZExt);
      unsigned ARMMaterializeFP(const ConstantFP *CFP, EVT VT);
      unsigned ARMMaterializeInt(const Constant *C, EVT VT);
@@ -195,8 +197,6 @@ class ARMFastISel : public FastISel {
  
      // Call handling routines.
    private:
-    bool FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
-                        unsigned &ResultReg);
      CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool Return);
      bool ProcessCallArgs(SmallVectorImpl<Value*> &Args,
                           SmallVectorImpl<unsigned> &ArgRegs,
@@ -228,8 +228,7 @@ class ARMFastISel : public FastISel {
  // we don't care about implicit defs here, just places we'll need to add a
  // default CCReg argument. Sets CPSR if we're setting CPSR instead of CCR.
  bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
-  const MCInstrDesc &MCID = MI->getDesc();
-  if (!MCID.hasOptionalDef())
+  if (!MI->hasOptionalDef())
      return false;
  
    // Look to see if our OptionalDef is defining CPSR or CCR.
@@ -685,6 +684,8 @@ unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) {
    return 0;
  }
  
+// TODO: unsigned ARMFastISel::TargetMaterializeFloatZero(const ConstantFP *CF);
+
  unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
    // Don't handle dynamic allocas.
    if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
@@ -870,12 +871,17 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3) {
      case MVT::i8:
      case MVT::i16:
      case MVT::i32:
-      if (!useAM3)
+      if (!useAM3) {
          // Integer loads/stores handle 12-bit offsets.
          needsLowering = ((Addr.Offset & 0xfff) != Addr.Offset);
-      else
+        // Handle negative offsets.
+        if (needsLowering && isThumb2)
+          needsLowering = !(Subtarget->hasV6T2Ops() && Addr.Offset < 0 &&
+                            Addr.Offset > -256);
+      } else {
          // ARM halfword load/stores and signed byte loads use +/-imm8 offsets.
          needsLowering = (Addr.Offset > 255 || Addr.Offset < -255);
+      }
        break;
      case MVT::f32:
      case MVT::f64:
@@ -931,7 +937,8 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr,
      // Now add the rest of the operands.
      MIB.addFrameIndex(FI);
  
-    // ARM halfword load/stores and signed byte loads need an additional operand.
+    // ARM halfword load/stores and signed byte loads need an additional
+    // operand.
      if (useAM3) {
        signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
        MIB.addReg(0);
@@ -944,7 +951,8 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr,
      // Now add the rest of the operands.
      MIB.addReg(Addr.Base.Reg);
  
-    // ARM halfword load/stores and signed byte loads need an additional operand.
+    // ARM halfword load/stores and signed byte loads need an additional
+    // operand.
      if (useAM3) {
        signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
        MIB.addReg(0);
@@ -957,41 +965,73 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr,
  }
  
  bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
-                              bool isZExt = true, bool allocReg = true) {
+                              unsigned Alignment, bool isZExt, bool allocReg) {
    assert(VT.isSimple() && "Non-simple types are invalid here!");
    unsigned Opc;
    bool useAM3 = false;
+  bool needVMOV = false;
    TargetRegisterClass *RC;  
    switch (VT.getSimpleVT().SimpleTy) {
      // This is mostly going to be Neon/vector support.
      default: return false;
      case MVT::i1:
      case MVT::i8:
-      if (isZExt) {
-        Opc = isThumb2 ? ARM::t2LDRBi12 : ARM::LDRBi12;
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = isZExt ? ARM::t2LDRBi8 : ARM::t2LDRSBi8;
+        else
+          Opc = isZExt ? ARM::t2LDRBi12 : ARM::t2LDRSBi12;
        } else {
-        Opc = isThumb2 ? ARM::t2LDRSBi12 : ARM::LDRSB;
-        if (!isThumb2) useAM3 = true;
+        if (isZExt) {
+          Opc = ARM::LDRBi12;
+        } else {
+          Opc = ARM::LDRSB;
+          useAM3 = true;
+        }
        }
        RC = ARM::GPRRegisterClass;
        break;
      case MVT::i16:
-      if (isZExt)
-        Opc = isThumb2 ? ARM::t2LDRHi12 : ARM::LDRH;
-      else
-        Opc = isThumb2 ? ARM::t2LDRSHi12 : ARM::LDRSH;
-      if (!isThumb2) useAM3 = true;
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = isZExt ? ARM::t2LDRHi8 : ARM::t2LDRSHi8;
+        else
+          Opc = isZExt ? ARM::t2LDRHi12 : ARM::t2LDRSHi12;
+      } else {
+        Opc = isZExt ? ARM::LDRH : ARM::LDRSH;
+        useAM3 = true;
+      }
        RC = ARM::GPRRegisterClass;
        break;
      case MVT::i32:
-      Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = ARM::t2LDRi8;
+        else
+          Opc = ARM::t2LDRi12;
+      } else {
+        Opc = ARM::LDRi12;
+      }
        RC = ARM::GPRRegisterClass;
        break;
      case MVT::f32:
-      Opc = ARM::VLDRS;
-      RC = TLI.getRegClassFor(VT);
+      // Unaligned loads need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        needVMOV = true;
+        VT = MVT::i32;
+        Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
+        RC = ARM::GPRRegisterClass;
+      } else {
+        Opc = ARM::VLDRS;
+        RC = TLI.getRegClassFor(VT);
+      }
        break;
      case MVT::f64:
+      if (Alignment && Alignment < 4) {
+        // FIXME: Unaligned loads need special handling.  Doublewords require
+        // word-alignment.
+        return false;
+      }
        Opc = ARM::VLDRD;
        RC = TLI.getRegClassFor(VT);
        break;
@@ -1006,6 +1046,16 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
    MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                      TII.get(Opc), ResultReg);
    AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad, useAM3);
+
+  // If we had an unaligned load of a float we've converted it to a regular
+  // load.  Now we must move from the GPR to the FP register.
+  if (needVMOV) {
+    unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(ARM::VMOVSR), MoveReg)
+                    .addReg(ResultReg));
+    ResultReg = MoveReg;
+  }
    return true;
  }
  
@@ -1024,12 +1074,14 @@ bool ARMFastISel::SelectLoad(const Instruction *I) {
    if (!ARMComputeAddress(I->getOperand(0), Addr)) return false;
  
    unsigned ResultReg;
-  if (!ARMEmitLoad(VT, ResultReg, Addr)) return false;
+  if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
+    return false;
    UpdateValueMap(I, ResultReg);
    return true;
  }
  
-bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) {
+bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr,
+                               unsigned Alignment) {
    unsigned StrOpc;
    bool useAM3 = false;
    switch (VT.getSimpleVT().SimpleTy) {
@@ -1045,21 +1097,57 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) {
        SrcReg = Res;
      } // Fallthrough here.
      case MVT::i8:
-      StrOpc = isThumb2 ? ARM::t2STRBi12 : ARM::STRBi12;
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRBi8;
+        else
+          StrOpc = ARM::t2STRBi12;
+      } else {
+        StrOpc = ARM::STRBi12;
+      }
        break;
      case MVT::i16:
-      StrOpc = isThumb2 ? ARM::t2STRHi12 : ARM::STRH;
-      if (!isThumb2) useAM3 = true;
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRHi8;
+        else
+          StrOpc = ARM::t2STRHi12;
+      } else {
+        StrOpc = ARM::STRH;
+        useAM3 = true;
+      }
        break;
      case MVT::i32:
-      StrOpc = isThumb2 ? ARM::t2STRi12 : ARM::STRi12;
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRi8;
+        else
+          StrOpc = ARM::t2STRi12;
+      } else {
+        StrOpc = ARM::STRi12;
+      }
        break;
      case MVT::f32:
        if (!Subtarget->hasVFP2()) return false;
        StrOpc = ARM::VSTRS;
+      // Unaligned stores need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+        AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                TII.get(ARM::VMOVRS), MoveReg)
+                        .addReg(SrcReg));
+        SrcReg = MoveReg;
+        VT = MVT::i32;
+        StrOpc = isThumb2 ? ARM::t2STRi12 : ARM::STRi12;
+      }
        break;
      case MVT::f64:
        if (!Subtarget->hasVFP2()) return false;
+      // FIXME: Unaligned stores need special handling.  Doublewords require
+      // word-alignment.
+      if (Alignment && Alignment < 4) {
+          return false;
+      }
        StrOpc = ARM::VSTRD;
        break;
    }
@@ -1069,7 +1157,7 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) {
    // Create the base instruction, then add the operands.
    MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                      TII.get(StrOpc))
-                            .addReg(SrcReg, getKillRegState(true));
+                            .addReg(SrcReg);
    AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore, useAM3);
    return true;
  }
@@ -1096,7 +1184,8 @@ bool ARMFastISel::SelectStore(const Instruction *I) {
    if (!ARMComputeAddress(I->getOperand(1), Addr))
      return false;
  
-  if (!ARMEmitStore(VT, SrcReg, Addr)) return false;
+  if (!ARMEmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
+    return false;
    return true;
  }
  
@@ -1258,6 +1347,8 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
    int Imm = 0;
    bool UseImm = false;
    bool isNegativeImm = false;
+  // FIXME: At -O0 we don't have anything that canonicalizes operand order.
+  // Thus, Src1Value may be a ConstantInt, but we're missing it.
    if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
      if (SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8 ||
          SrcVT == MVT::i1) {
@@ -1313,7 +1404,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
    unsigned SrcReg1 = getRegForValue(Src1Value);
    if (SrcReg1 == 0) return false;
  
-  unsigned SrcReg2;
+  unsigned SrcReg2 = 0;
    if (!UseImm) {
      SrcReg2 = getRegForValue(Src2Value);
      if (SrcReg2 == 0) return false;
@@ -1530,7 +1621,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
        (ARM_AM::getSOImmVal(Imm) != -1);
    }
  
-  unsigned Op2Reg;
+  unsigned Op2Reg = 0;
    if (!UseImm) {
      Op2Reg = getRegForValue(I->getOperand(2));
      if (Op2Reg == 0) return false;
@@ -1623,12 +1714,6 @@ bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) {
    if (isFloat && !Subtarget->hasVFP2())
      return false;
  
-  unsigned Op1 = getRegForValue(I->getOperand(0));
-  if (Op1 == 0) return false;
-
-  unsigned Op2 = getRegForValue(I->getOperand(1));
-  if (Op2 == 0) return false;
-
    unsigned Opc;
    bool is64bit = VT == MVT::f64 || VT == MVT::i64;
    switch (ISDOpcode) {
@@ -1643,6 +1728,12 @@ bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) {
        Opc = is64bit ? ARM::VMULD : ARM::VMULS;
        break;
    }
+  unsigned Op1 = getRegForValue(I->getOperand(0));
+  if (Op1 == 0) return false;
+
+  unsigned Op2 = getRegForValue(I->getOperand(1));
+  if (Op2 == 0) return false;
+
    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                            TII.get(Opc), ResultReg)
@@ -1653,18 +1744,6 @@ bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) {
  
  // Call Handling Code
  
-bool ARMFastISel::FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src,
-                                 EVT SrcVT, unsigned &ResultReg) {
-  unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
-                           Src, /*TODO: Kill=*/false);
-
-  if (RR != 0) {
-    ResultReg = RR;
-    return true;
-  } else
-    return false;
-}
-
  // This is largely taken directly from CCAssignFnForNode - we don't support
  // varargs in FastISel so that part has been removed.
  // TODO: We may not support all of this.
@@ -1681,7 +1760,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) {
      // Use target triple & subtarget features to do actual dispatch.
      if (Subtarget->isAAPCS_ABI()) {
        if (Subtarget->hasVFP2() &&
-          FloatABIType == FloatABI::Hard)
+          TM.Options.FloatABIType == FloatABI::Hard)
          return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
        else
          return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
@@ -1730,21 +1809,23 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
      switch (VA.getLocInfo()) {
        case CCValAssign::Full: break;
        case CCValAssign::SExt: {
-        EVT DestVT = VA.getLocVT();
+        MVT DestVT = VA.getLocVT();
          unsigned ResultReg = ARMEmitIntExt(ArgVT, Arg, DestVT,
                                             /*isZExt*/false);
          assert (ResultReg != 0 && "Failed to emit a sext");
          Arg = ResultReg;
+        ArgVT = DestVT;
          break;
        }
        case CCValAssign::AExt:
          // Intentional fall-through.  Handle AExt and ZExt.
        case CCValAssign::ZExt: {
-        EVT DestVT = VA.getLocVT();
+        MVT DestVT = VA.getLocVT();
          unsigned ResultReg = ARMEmitIntExt(ArgVT, Arg, DestVT,
                                             /*isZExt*/true);
          assert (ResultReg != 0 && "Failed to emit a sext");
          Arg = ResultReg;
+        ArgVT = DestVT;
          break;
        }
        case CCValAssign::BCvt: {
@@ -2073,9 +2154,6 @@ bool ARMFastISel::SelectCall(const Instruction *I,
      if (IntrMemName && e-i <= 2)
        break;
  
-    unsigned Arg = getRegForValue(*i);
-    if (Arg == 0)
-      return false;
      ISD::ArgFlagsTy Flags;
      unsigned AttrInd = i - CS.arg_begin() + 1;
      if (CS.paramHasAttr(AttrInd, Attribute::SExt))
@@ -2095,6 +2173,11 @@ bool ARMFastISel::SelectCall(const Instruction *I,
      if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8 &&
          ArgVT != MVT::i1)
        return false;
+
+    unsigned Arg = getRegForValue(*i);
+    if (Arg == 0)
+      return false;
+
      unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
      Flags.setOrigAlign(OriginalAlignment);
  
@@ -2149,18 +2232,70 @@ bool ARMFastISel::SelectCall(const Instruction *I,
    return true;
  }
  
+// Returns true when a memcpy of Len bytes is short enough (at most 16 bytes)
+// to be a candidate for inline expansion.
+bool ARMFastISel::ARMIsMemCpySmall(uint64_t Len) {
+  const uint64_t MaxInlineLen = 16;
+  return !(Len > MaxInlineLen);
+}
+
+bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len) {
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!ARMIsMemCpySmall(Len))
+    return false;
+
+  // We don't care about alignment here since we just emit integer accesses.
+  while (Len) {
+    MVT VT;
+    if (Len >= 4)
+      VT = MVT::i32;
+    else if (Len >= 2)
+      VT = MVT::i16;
+    else {
+      assert(Len == 1);
+      VT = MVT::i8;
+    }
+
+    bool RV;
+    unsigned ResultReg;
+    RV = ARMEmitLoad(VT, ResultReg, Src);
+    assert (RV = true && "Should be able to handle this load.");
+    RV = ARMEmitStore(VT, ResultReg, Dest);
+    assert (RV = true && "Should be able to handle this store.");
+
+    unsigned Size = VT.getSizeInBits()/8;
+    Len -= Size;
+    Dest.Offset += Size;
+    Src.Offset += Size;
+  }
+
+  return true;
+}
+
  bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
    // FIXME: Handle more intrinsics.
    switch (I.getIntrinsicID()) {
    default: return false;
    case Intrinsic::memcpy:
    case Intrinsic::memmove: {
-    // FIXME: Small memcpy/memmove's are common enough that we want to do them
-    // without a call if possible.
      const MemTransferInst &MTI = cast<MemTransferInst>(I);
      // Don't handle volatile.
      if (MTI.isVolatile())
        return false;
+
+    // Disable inlining for memmove before calls to ComputeAddress.  Otherwise,
+    // we would emit dead code because we don't currently handle memmoves.
+    bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+      // Small memcpy's are common enough that we want to do them without a call
+      // if possible.
+      uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+      if (ARMIsMemCpySmall(Len)) {
+        Address Dest, Src;
+        if (!ARMComputeAddress(MTI.getRawDest(), Dest) ||
+            !ARMComputeAddress(MTI.getRawSource(), Src))
+          return false;
+        if (ARMTryEmitSmallMemCpy(Dest, Src, Len))
+          return true;
+      }
+    }
      
      if (!MTI.getLength()->getType()->isIntegerTy(32))
        return false;
@@ -2367,7 +2502,7 @@ bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
    if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false;
    
    unsigned ResultReg = MI->getOperand(0).getReg();
-  if (!ARMEmitLoad(VT, ResultReg, Addr, isZExt, false))
+  if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false))
      return false;
    MI->eraseFromParent();
    return true;