Remove the old CodePlacementOpt pass.

[oota-llvm.git] / lib / Target / ARM / ARMISelLowering.cpp
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index 82b475a44f38a2eb5a121476337bed1630522223..bb26090d2d8d3889142be31851bb91a95cce4cca 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -504,6 +504,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
      setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
      setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  
      setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
      setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
@@ -521,6 +522,23 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
      setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
  
+    // Mark v2f32 intrinsics.
+    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
+
      // Neon does not support some operations on v1i64 and v2i64 types.
      setOperationAction(ISD::MUL, MVT::v1i64, Expand);
      // Custom handling for some quad-vector types to detect VMULL.
@@ -546,6 +564,16 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
      setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
  
+    // Custom expand long extensions to vectors.
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32,  Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64,  Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
+
      // NEON does not have single instruction CTPOP for vectors with element
      // types wider than 8-bits.  However, custom lowering can leverage the
      // v8i8/v16i8 vcnt instruction.
@@ -554,6 +582,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
      setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
  
+    // NEON only has FMA instructions as of VFP4.
+    if (!Subtarget->hasVFP4()) {
+      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
+      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
+    }
+
      setTargetDAGCombine(ISD::INTRINSIC_VOID);
      setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
      setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
@@ -835,21 +869,19 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setSchedulingPreference(Sched::Hybrid);
  
    //// temporary - rewrite interface to use type
-  maxStoresPerMemset = 8;
-  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
-  maxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
-  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
-  maxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
-  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+  MaxStoresPerMemset = 8;
+  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
  
    // On ARM arguments smaller than 4 bytes are extended, so all arguments
    // are at least 4 bytes aligned.
    setMinStackArgumentAlignment(4);
  
-  benefitFromCodePlacementOpt = true;
-
    // Prefer likely predicted branches to selects on out-of-order cores.
-  predictableSelectIsExpensive = Subtarget->isLikeA9();
+  PredictableSelectIsExpensive = Subtarget->isLikeA9();
  
    setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
  }
@@ -1581,7 +1613,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
        // On ELF targets for PIC code, direct calls should go through the PLT
        unsigned OpFlags = 0;
        if (Subtarget->isTargetELF() &&
-                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
+          getTargetMachine().getRelocationModel() == Reloc::PIC_)
          OpFlags = ARMII::MO_PLT;
        Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
      }
@@ -1928,15 +1960,9 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
    CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
                                                 isVarArg));
  
-  // If this is the first return lowered for this function, add
-  // the regs to the liveout set for the function.
-  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
-    for (unsigned i = 0; i != RVLocs.size(); ++i)
-      if (RVLocs[i].isRegLoc())
-        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
-  }
-
    SDValue Flag;
+  SmallVector<SDValue, 4> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  
    // Copy the result values into the output registers.
    for (unsigned i = 0, realRVLocIdx = 0;
@@ -1965,10 +1991,12 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
  
          Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
          Flag = Chain.getValue(1);
+        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
          VA = RVLocs[++i]; // skip ahead to next loc
          Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                   HalfGPRs.getValue(1), Flag);
          Flag = Chain.getValue(1);
+        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
          VA = RVLocs[++i]; // skip ahead to next loc
  
          // Extract the 2nd half and fall through to handle it as an f64 value.
@@ -1981,6 +2009,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
                                    DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
        Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
                                 Flag);
@@ -1990,15 +2019,16 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
      // Guarantee that all emitted copies are
      // stuck together, avoiding something bad.
      Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
    }
  
-  SDValue result;
+  // Update chain and glue.
+  RetOps[0] = Chain;
    if (Flag.getNode())
-    result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
-  else // Return Void
-    result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain);
+    RetOps.push_back(Flag);
  
-  return result;
+  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other,
+                     RetOps.data(), RetOps.size());
  }
  
  bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2249,8 +2279,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
    EVT PtrVT = getPointerTy();
    DebugLoc dl = Op.getDebugLoc();
    const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
-  if (RelocM == Reloc::PIC_) {
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
      ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(GV,
@@ -2294,8 +2323,6 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
    DebugLoc dl = Op.getDebugLoc();
    const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
-  MachineFunction &MF = DAG.getMachineFunction();
-  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  
    // FIXME: Enable this for static codegen when tool issues are fixed.  Also
    // update ARMFastISel::ARMMaterializeGV.
@@ -2323,6 +2350,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
    if (RelocM == Reloc::Static) {
      CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
    } else {
+    ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
      ARMPCLabelIndex = AFI->createPICLabelUId();
      unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
      ARMConstantPoolValue *CPV =
@@ -2403,7 +2431,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      EVT PtrVT = getPointerTy();
-    DebugLoc dl = Op.getDebugLoc();
      Reloc::Model RelocM = getTargetMachine().getRelocationModel();
      SDValue CPAddr;
      unsigned PCAdj = (RelocM != Reloc::PIC_)
@@ -2578,7 +2605,7 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
  }
  
  // The remaining GPRs hold either the beginning of variable-argument
-// data, or the beginning of an aggregate passed by value (usuall
+// data, or the beginning of an aggregate passed by value (usually
  // byval).  Either way, we allocate stack slots adjacent to the data
  // provided by our caller, and store the unallocated registers there.
  // If this is a variadic function, the va_list pointer will begin with
@@ -2663,7 +2690,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
    CCInfo.AnalyzeFormalArguments(Ins,
                                  CCAssignFnForNode(CallConv, /* Return*/ false,
                                                    isVarArg));
-  
+
    SmallVector<SDValue, 16> ArgValues;
    int lastInsIndex = -1;
    SDValue ArgValue;
@@ -2778,7 +2805,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
              } else {
                int FI = MFI->CreateFixedObject(Flags.getByValSize(),
                                                VA.getLocMemOffset(), false);
-              InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));              
+              InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
              }
            } else {
              int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
@@ -3414,6 +3441,47 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
    return FrameAddr;
  }
  
+/// Custom expand long vector extensions, where size(DestVec) > 2*size(SrcVec)
+/// and size(DestVec) > 128 bits; only 4x and 8x extensions are handled.
+/// This is done by extending SrcVec to twice its element size, splitting that
+/// result into two halves, extending each half to the destination element
+/// size, and concatenating the two extended halves into the destination.
+static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);
+  EVT SrcVT = Op.getValueType();
+  EVT DestVT = N->getValueType(0);
+
+  assert(DestVT.getSizeInBits() > 128 &&
+         "Custom sext/zext expansion needs >128-bit vector.");
+  // Anything other than a 4x or 8x extension gets the default expansion.
+  if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
+      SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
+    return SDValue();
+
+  DebugLoc dl = N->getDebugLoc();
+  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
+  unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
+  unsigned NumElts = SrcVT.getVectorNumElements();
+  LLVMContext &Ctx = *DAG.getContext();
+  SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
+
+  EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
+                               NumElts);
+  EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
+                                 NumElts/2);
+  EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
+                               NumElts/2);
+
+  Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
+  SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
+                        DAG.getIntPtrConstant(0));
+  SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
+                        DAG.getIntPtrConstant(NumElts/2));
+  ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
+  ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
+}
+
  /// ExpandBITCAST - If the target supports VFP, this function is called to
  /// expand a bit convert where either the source or destination type is i64 to
  /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
@@ -3575,7 +3643,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
  /// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
  /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
  /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
-/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] 
+/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
  ///            [b0 b1 b2 b3 b4 b5 b6 b7]
  ///           +[b1 b0 b3 b2 b5 b4 b7 b6]
  /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
@@ -3596,7 +3664,7 @@ static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
  /// bit-count for each 16-bit element from the operand.  We need slightly
  /// different sequencing for v4i16 and v8i16 to stay within NEON's available
  /// 64/128-bit registers.
-/// 
+///
  /// Trace for v4i16:
  /// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
  /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
@@ -3627,7 +3695,7 @@ static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
  /// input    = [v0    v1    ] (vi: 32-bit elements)
  /// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
  /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
-/// vrev: N0 = [k1 k0 k3 k2 ] 
+/// vrev: N0 = [k1 k0 k3 k2 ]
  ///            [k0 k1 k2 k3 ]
  ///       N1 =+[k1 k0 k3 k2 ]
  ///            [k0 k2 k1 k3 ]
@@ -4296,6 +4364,21 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
    return true;
  }
  
+/// \return true if mask M reverses the elements of a vector of type VT.
+static bool isReverseMask(ArrayRef<int> M, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  // Make sure the mask has the right size.
+  if (NumElts != M.size())
+      return false;
+
+  // Look for e.g. <15, ..., 3, -1, 1, 0>; negative entries match any position.
+  for (unsigned i = 0; i != NumElts; ++i)
+    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
+      return false;
+
+  return true;
+}
+
  // If N is an integer constant that can be moved into a register in one
  // instruction, return an SDValue of such a constant (will become a MOV
  // instruction).  Otherwise return null.
@@ -4390,7 +4473,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
  
      ValueCounts.insert(std::make_pair(V, 0));
      unsigned &Count = ValueCounts[V];
-    
+
      // Is this value dominant? (takes up more than half of the lanes)
      if (++Count > (NumElts / 2)) {
        hasDominantValue = true;
@@ -4418,8 +4501,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
  
        // If we are VDUPing a value that comes directly from a vector, that will
        // cause an unnecessary move to and from a GPR, where instead we could
-      // just use VDUPLANE.
-      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      // just use VDUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the VDUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
          // We need to create a new undef vector to use for the VDUPLANE if the
          // size of the vector from which we get the value is different than the
          // size of the vector that we need to create. We will insert the element
@@ -4434,12 +4520,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                   DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
                          Value, DAG.getConstant(index, MVT::i32)),
                             DAG.getConstant(index, MVT::i32));
-        } else {
+        } else
            N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                          Value->getOperand(0), Value->getOperand(1));
-        }
-      }
-      else
+      } else
          N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
  
        if (!usesOnlyOneValue) {
@@ -4471,7 +4555,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
      if (usesOnlyOneValue) {
        SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
        if (isConstant && Val.getNode())
-        return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 
+        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
      }
    }
  
@@ -4691,7 +4775,8 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
            isVZIPMask(M, VT, WhichResult) ||
            isVTRN_v_undef_Mask(M, VT, WhichResult) ||
            isVUZP_v_undef_Mask(M, VT, WhichResult) ||
-          isVZIP_v_undef_Mask(M, VT, WhichResult));
+          isVZIP_v_undef_Mask(M, VT, WhichResult) ||
+          ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
  }
  
  /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
@@ -4795,6 +4880,23 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
                                   &VTBLMask[0], 8));
  }
  
+static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
+                                                      SelectionDAG &DAG) {
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue OpLHS = Op.getOperand(0);
+  EVT VT = OpLHS.getValueType();
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
+         "Expect an v8i16/v16i8 type");
+  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
+  // Full reverse of a 128-bit vector. For v16i8: VREV64 gives <7, ..., 0,
+  // 15, ..., 8>; VEXT #8 then swaps the two double words, so the result is
+  // <15, ..., 0>. The v8i16 case is the same with an extract index of 4.
+  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
+  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
+                     DAG.getConstant(ExtractNum, MVT::i32));
+}
+
  static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
    SDValue V1 = Op.getOperand(0);
    SDValue V2 = Op.getOperand(1);
@@ -4932,6 +5034,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
      return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
  
+  if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
+    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
+
    if (VT == MVT::v8i8) {
      SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
      if (NewOp.getNode())
@@ -5565,6 +5670,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
    case ISD::BITCAST:
      Res = ExpandBITCAST(N, DAG);
      break;
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    Res = ExpandVectorExtension(N, DAG);
+    break;
    case ISD::SRL:
    case ISD::SRA:
      Res = Expand64BitShift(N, DAG, Subtarget);
@@ -6295,6 +6404,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
      MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
    unsigned MJTI = JTI->createJumpTableIndex(LPadList);
    unsigned UId = AFI->createJumpTableUId();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
  
    // Create the MBBs for the dispatch code.
  
@@ -6304,14 +6414,11 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
  
    MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
    unsigned trap_opcode;
-  if (Subtarget->isThumb()) {
+  if (Subtarget->isThumb())
      trap_opcode = ARM::tTRAP;
-  } else {
-    if (Subtarget->useNaClTrap())
-      trap_opcode = ARM::TRAPNaCl;
-    else
-      trap_opcode = ARM::TRAP;
-  }
+  else
+    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
+
    BuildMI(TrapBB, dl, TII->get(trap_opcode));
    DispatchBB->addSuccessor(TrapBB);
  
@@ -6458,11 +6565,14 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
                     .addImm(0)
                     .addMemOperand(JTMMOLd));
  
-    unsigned NewVReg6 = MRI->createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
-                   .addReg(ARM::CPSR, RegState::Define)
-                   .addReg(NewVReg5, RegState::Kill)
-                   .addReg(NewVReg3));
+    unsigned NewVReg6 = NewVReg5;
+    if (RelocM == Reloc::PIC_) {
+      NewVReg6 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+                     .addReg(ARM::CPSR, RegState::Define)
+                     .addReg(NewVReg5, RegState::Kill)
+                     .addReg(NewVReg3));
+    }
  
      BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
        .addReg(NewVReg6, RegState::Kill)
@@ -6542,11 +6652,18 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
        .addImm(0)
        .addMemOperand(JTMMOLd));
  
-    BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
-      .addReg(NewVReg5, RegState::Kill)
-      .addReg(NewVReg4)
-      .addJumpTableIndex(MJTI)
-      .addImm(UId);
+    if (RelocM == Reloc::PIC_) {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
+        .addReg(NewVReg5, RegState::Kill)
+        .addReg(NewVReg4)
+        .addJumpTableIndex(MJTI)
+        .addImm(UId);
+    } else {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
+        .addReg(NewVReg5, RegState::Kill)
+        .addJumpTableIndex(MJTI)
+        .addImm(UId);
+    }
    }
  
    // Add the jump table entries as successors to the MBB.