[ARM] Minor refactoring. NFC.

[oota-llvm.git] / lib / Target / ARM / ARMISelLowering.cpp
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index e35bf49d6b9e633a4cd3e8f7475057cacd1f7db9..a3af444ced939fb66550e91be4b8a2dda3d850e5 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -368,11 +368,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
        { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
-
-      { RTLIB::SDIV_I32, "__rt_sdiv",   CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::UDIV_I32, "__rt_udiv",   CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP },
      };
  
      for (const auto &LC : LibraryCalls) {
@@ -743,6 +738,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
      setOperationAction(ISD::UDIV,  MVT::i32, Expand);
    }
  
+  if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
+    setOperationAction(ISD::SDIV, MVT::i32, Custom);
+    setOperationAction(ISD::UDIV, MVT::i32, Custom);
+
+    setOperationAction(ISD::SDIV, MVT::i64, Custom);
+    setOperationAction(ISD::UDIV, MVT::i64, Custom);
+  }
+
    setOperationAction(ISD::SREM,  MVT::i32, Expand);
    setOperationAction(ISD::UREM,  MVT::i32, Expand);
    // Register based DivRem for AEABI (RTABI 4.2)
@@ -1119,6 +1122,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
  
    case ARMISD::WIN__CHKSTK:   return "ARMISD:::WIN__CHKSTK";
+  case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
  
    case ARMISD::VCEQ:          return "ARMISD::VCEQ";
    case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
@@ -1174,6 +1178,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
    case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
    case ARMISD::VBSL:          return "ARMISD::VBSL";
+  case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
    case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
    case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
    case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
@@ -2052,6 +2057,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    CallingConv::ID CallerCC = CallerF->getCallingConv();
    bool CCMatch = CallerCC == CalleeCC;
  
+  assert(Subtarget->supportsTailCall());
+
    // Look for obvious safe cases to perform tail call optimization that do not
    // require ABI changes. This is what gcc calls sibcall.
  
@@ -2071,26 +2078,6 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    if (isCalleeStructRet || isCallerStructRet)
      return false;
  
-  // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
-  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
-  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
-  // support in the assembler and linker to be used. This would need to be
-  // fixed to fully support tail calls in Thumb1.
-  //
-  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
-  // LR.  This means if we need to reload LR, it takes an extra instructions,
-  // which outweighs the value of the tail call; but here we don't know yet
-  // whether LR is going to be used.  Probably the right approach is to
-  // generate the tail call here and turn it back into CALL/RET in
-  // emitEpilogue if LR is used.
-
-  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
-  // but we need to make sure there are enough registers; the only valid
-  // registers are the 4 used for parameters.  We don't currently do this
-  // case.
-  if (Subtarget->isThumb1Only())
-    return false;
-
    // Externally-defined functions with weak linkage should not be
    // tail-called on ARM when the OS does not support dynamic
    // pre-emption of symbols, as the AAELF spec requires normal calls
@@ -2438,7 +2425,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
    if (!CI->isTailCall() || Attr.getValueAsString() == "true")
      return false;
  
-  return !Subtarget->isThumb1Only();
+  return true;
  }
  
  // Trying to write a 64 bit value so need to split into two 32 bit values first,
@@ -6645,6 +6632,85 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
                       LoadSin.getValue(0), LoadCos.getValue(0));
  }
  
+SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
+                                                  bool Signed,
+                                                  SDValue &Chain) const {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "unexpected type for custom lowering DIV");
+  SDLoc dl(Op);
+
+  const auto &DL = DAG.getDataLayout();
+  const auto &TLI = DAG.getTargetLoweringInfo();
+
+  const char *Name = nullptr;
+  if (Signed)
+    Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
+  else
+    Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
+
+  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
+
+  ARMTargetLowering::ArgListTy Args;
+
+  for (auto AI : {1, 0}) {
+    ArgListEntry Arg;
+    Arg.Node = Op.getOperand(AI);
+    Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
+    Args.push_back(Arg);
+  }
+
+  CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl)
+    .setChain(Chain)
+    .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
+               ES, std::move(Args), 0);
+
+  return LowerCallTo(CLI).first;
+}
+
+SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
+                                            bool Signed) const {
+  assert(Op.getValueType() == MVT::i32 &&
+         "unexpected type for custom lowering DIV");
+  SDLoc dl(Op);
+
+  SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
+                               DAG.getEntryNode(), Op.getOperand(1));
+
+  return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
+}
+
+void ARMTargetLowering::ExpandDIV_Windows(
+    SDValue Op, SelectionDAG &DAG, bool Signed,
+    SmallVectorImpl<SDValue> &Results) const {
+  const auto &DL = DAG.getDataLayout();
+  const auto &TLI = DAG.getTargetLoweringInfo();
+
+  assert(Op.getValueType() == MVT::i64 &&
+         "unexpected type for custom lowering DIV");
+  SDLoc dl(Op);
+
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1),
+                           DAG.getConstant(0, dl, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1),
+                           DAG.getConstant(1, dl, MVT::i32));
+  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi);
+
+  SDValue DBZCHK =
+      DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or);
+
+  SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
+
+  SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
+  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
+                              DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
+  Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
+
+  Results.push_back(Lower);
+  Results.push_back(Upper);
+}
+
  static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
    // Monotonic load/store is legal for all targets
    if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
@@ -6736,8 +6802,14 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
    case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
    case ISD::MUL:           return LowerMUL(Op, DAG);
-  case ISD::SDIV:          return LowerSDIV(Op, DAG);
-  case ISD::UDIV:          return LowerUDIV(Op, DAG);
+  case ISD::SDIV:
+    if (Subtarget->isTargetWindows())
+      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
+    return LowerSDIV(Op, DAG);
+  case ISD::UDIV:
+    if (Subtarget->isTargetWindows())
+      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
+    return LowerUDIV(Op, DAG);
    case ISD::ADDC:
    case ISD::ADDE:
    case ISD::SUBC:
@@ -6758,13 +6830,14 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
      llvm_unreachable("Don't know how to custom lower this!");
    case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
    case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+  case ARMISD::WIN__DBZCHK: return SDValue();
    }
  }
  
  /// ReplaceNodeResults - Replace the results of node with an illegal result
  /// type with new values built out of custom code.
  void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
-                                           SmallVectorImpl<SDValue>&Results,
+                                           SmallVectorImpl<SDValue> &Results,
                                             SelectionDAG &DAG) const {
    SDValue Res;
    switch (N->getOpcode()) {
@@ -6787,6 +6860,11 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
    case ISD::READCYCLECOUNTER:
      ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
      return;
+  case ISD::UDIV:
+  case ISD::SDIV:
+    assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
+    return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
+                             Results);
    }
    if (Res.getNode())
      Results.push_back(Res);
@@ -7710,6 +7788,32 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
    return MBB;
  }
  
+MachineBasicBlock *
+ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
+  MF->push_back(ContBB);
+  ContBB->splice(ContBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  MBB->addSuccessor(ContBB);
+
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+  MF->push_back(TrapBB);
+  BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249);
+  MBB->addSuccessor(TrapBB);
+
+  BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ))
+      .addReg(MI->getOperand(0).getReg())
+      .addMBB(TrapBB);
+
+  MI->eraseFromParent();
+  return ContBB;
+}
+
  MachineBasicBlock *
  ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                 MachineBasicBlock *BB) const {
@@ -7964,11 +8068,46 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      return EmitStructByval(MI, BB);
    case ARM::WIN__CHKSTK:
      return EmitLowered__chkstk(MI, BB);
+  case ARM::WIN__DBZCHK:
+    return EmitLowered__dbzchk(MI, BB);
+  }
+}
+
+/// \brief Attaches vregs to MEMCPY that it will use as scratch registers
+/// when it is expanded into LDM/STM. This is done as a post-isel lowering
+/// instead of as a custom inserter because we need the use list from the SDNode.
+static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
+                                   MachineInstr *MI, const SDNode *Node) {
+  bool isThumb1 = Subtarget->isThumb1Only();
+
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineInstrBuilder MIB(*MF, MI);
+
+  // If the new dst/src is unused mark it as dead.
+  if (!Node->hasAnyUseOfValue(0)) {
+    MI->getOperand(0).setIsDead(true);
+  }
+  if (!Node->hasAnyUseOfValue(1)) {
+    MI->getOperand(1).setIsDead(true);
+  }
+
+  // The MEMCPY both defines and kills the scratch registers.
+  for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) {
+    unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
+                                                         : &ARM::GPRRegClass);
+    MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
    }
  }
  
  void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                        SDNode *Node) const {
+  if (MI->getOpcode() == ARM::MEMCPY) {
+    attachMEMCPYScratchRegs(Subtarget, MI, Node);
+    return;
+  }
+
    const MCInstrDesc *MCID = &MI->getDesc();
    // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
    // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
@@ -9696,30 +9835,30 @@ static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
  ///  vcvt.s32.f32    d16, d16
  /// becomes:
  ///  vcvt.s32.f32    d16, d16, #3
-static SDValue PerformVCVTCombine(SDNode *N,
-                                  TargetLowering::DAGCombinerInfo &DCI,
+static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
-  SelectionDAG &DAG = DCI.DAG;
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
    SDValue Op = N->getOperand(0);
+  if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
+    return SDValue();
  
-  if (!Subtarget->hasNEON() || !Op.getValueType().isVector() ||
-      Op.getOpcode() != ISD::FMUL)
+  SDValue ConstVec = Op->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
      return SDValue();
  
    uint64_t C;
-  SDValue N0 = Op->getOperand(0);
-  SDValue ConstVec = Op->getOperand(1);
    bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
-
-  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
-      !isConstVecPow2(ConstVec, isSigned, C))
+  if (!isConstVecPow2(ConstVec, isSigned, C))
      return SDValue();
  
    MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  uint32_t FloatBits = FloatTy.getSizeInBits();
    MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  uint32_t IntBits = IntTy.getSizeInBits();
    unsigned NumLanes = Op.getValueType().getVectorNumElements();
-  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 ||
-      NumLanes > 4) {
+  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
      // These instructions only exist converting from f32 to i32. We can handle
      // smaller integers by generating an extra truncate, but larger ones would
      // be lossy. We also can't handle more then 4 lanes, since these intructions
@@ -9730,13 +9869,12 @@ static SDValue PerformVCVTCombine(SDNode *N,
    SDLoc dl(N);
    unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
      Intrinsic::arm_neon_vcvtfp2fxu;
-  SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
-                                 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
-                                 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
-                                 N0,
-                                 DAG.getConstant(Log2_64(C), dl, MVT::i32));
+  SDValue FixConv = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
+      DAG.getConstant(Log2_64(C), dl, MVT::i32));
  
-  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+  if (IntBits < FloatBits)
      FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
  
    return FixConv;
@@ -9751,23 +9889,24 @@ static SDValue PerformVCVTCombine(SDNode *N,
  ///  vdiv.f32        d16, d17, d16
  /// becomes:
  ///  vcvt.f32.s32    d16, d16, #3
-static SDValue PerformVDIVCombine(SDNode *N,
-                                  TargetLowering::DAGCombinerInfo &DCI,
+static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
-  SelectionDAG &DAG = DCI.DAG;
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
    SDValue Op = N->getOperand(0);
    unsigned OpOpcode = Op.getNode()->getOpcode();
-
-  if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() ||
+  if (!N->getValueType(0).isVector() ||
        (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
      return SDValue();
  
-  uint64_t C;
    SDValue ConstVec = N->getOperand(1);
-  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
  
-  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
-      !isConstVecPow2(ConstVec, isSigned, C))
+  uint64_t C;
+  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
+  if (!isConstVecPow2(ConstVec, isSigned, C))
      return SDValue();
  
    MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
@@ -10176,8 +10315,10 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
    case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
    case ISD::FP_TO_SINT:
-  case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget);
-  case ISD::FDIV:       return PerformVDIVCombine(N, DCI, Subtarget);
+  case ISD::FP_TO_UINT:
+    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
+  case ISD::FDIV:
+    return PerformVDIVCombine(N, DCI.DAG, Subtarget);
    case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
    case ISD::SHL:
    case ISD::SRA:
@@ -11616,6 +11757,8 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
  
  void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
      IRBuilder<> &Builder) const {
+  if (!Subtarget->hasV7Ops())
+    return;
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
  }
@@ -11695,9 +11838,6 @@ bool ARMTargetLowering::lowerInterleavedLoad(
                                              Intrinsic::arm_neon_vld3,
                                              Intrinsic::arm_neon_vld4};
  
-  Function *VldnFunc =
-      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy);
-
    IRBuilder<> Builder(LI);
    SmallVector<Value *, 2> Ops;
  
@@ -11705,6 +11845,9 @@ bool ARMTargetLowering::lowerInterleavedLoad(
    Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
    Ops.push_back(Builder.getInt32(LI->getAlignment()));
  
+  Type *Tys[] = { VecTy, Int8Ptr };
+  Function *VldnFunc =
+      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
  
    // Replace uses of each shufflevector with the corresponding vector loaded
@@ -11796,14 +11939,15 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
    static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                         Intrinsic::arm_neon_vst3,
                                         Intrinsic::arm_neon_vst4};
-  Function *VstNFunc = Intrinsic::getDeclaration(
-      SI->getModule(), StoreInts[Factor - 2], SubVecTy);
-
    SmallVector<Value *, 6> Ops;
  
    Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
    Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
  
+  Type *Tys[] = { Int8Ptr, SubVecTy };
+  Function *VstNFunc = Intrinsic::getDeclaration(
+      SI->getModule(), StoreInts[Factor - 2], Tys);
+
    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++)
      Ops.push_back(Builder.CreateShuffleVector(