[X86][SSE] Add general memory folding for (V)INSERTPS instruction

author Simon Pilgrim <llvm-dev@redking.me.uk>

Wed, 4 Nov 2015 20:48:09 +0000 (20:48 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Wed, 4 Nov 2015 20:48:09 +0000 (20:48 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Wed, 4 Nov 2015 20:48:09 +0000 (20:48 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Wed, 4 Nov 2015 20:48:09 +0000 (20:48 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 3434f8c7f0bf5bd5b308455f2191f57db4677365..685067a28c60de023e74ff418e7878ddabfd4738 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26174,52 +26174,6 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
-                                         SelectionDAG &DAG) {
-  SDLoc dl(Load);
-  MVT VT = Load->getSimpleValueType(0);
-  MVT EVT = VT.getVectorElementType();
-  SDValue Addr = Load->getOperand(1);
-  SDValue NewAddr = DAG.getNode(
-      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-      DAG.getConstant(Index * EVT.getStoreSize(), dl,
-                      Addr.getSimpleValueType()));
-
-  SDValue NewLoad =
-      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                  DAG.getMachineFunction().getMachineMemOperand(
-                      Load->getMemOperand(), 0, EVT.getStoreSize()));
-  return NewLoad;
-}
-
-static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
-                                      const X86Subtarget *Subtarget) {
-  SDLoc dl(N);
-  MVT VT = N->getOperand(1)->getSimpleValueType(0);
-  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
-         "X86insertps is only defined for v4x32");
-
-  SDValue Ld = N->getOperand(1);
-  if (MayFoldLoad(Ld)) {
-    // Extract the countS bits from the immediate so we can get the proper
-    // address when narrowing the vector load to a specific element.
-    // When the second source op is a memory address, insertps doesn't use
-    // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
-
-    // Create this as a scalar to vector to match the instruction pattern.
-    SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
-    // countS bits are ignored when loading from memory on insertps, which
-    // means we don't need to explicitly set them to 0.
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
-                       LoadScalarToVector, N->getOperand(2));
-  }
-  return SDValue();
-}
-
  static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
    SDValue V0 = N->getOperand(0);
    SDValue V1 = N->getOperand(1);
@@ -26685,11 +26639,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::VPERM2X128:
    case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
    case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
-  case X86ISD::INSERTPS: {
-    if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
-      return PerformINSERTPSCombine(N, DAG, Subtarget);
-    break;
-  }
    case X86ISD::BLENDI:    return PerformBLENDICombine(N, DAG);
    }
  
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index 81c61d997c025d665c6d1f67431722922d899992..a5b3a12e9102c579146c69b2b1702d585d1dac1b 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -4876,12 +4876,35 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
    return false;
  }
  
-static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) {
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+                        int PtrOffset = 0) {
    unsigned NumAddrOps = MOs.size();
-  for (unsigned i = 0; i != NumAddrOps; ++i)
-    MIB.addOperand(MOs[i]);
-  if (NumAddrOps < 4) // FrameIndex only
-    addOffset(MIB, 0);
+
+  if (NumAddrOps < 4) {
+    // FrameIndex only - add an immediate offset (whether it is zero or not).
+    for (unsigned i = 0; i != NumAddrOps; ++i)
+      MIB.addOperand(MOs[i]);
+    addOffset(MIB, PtrOffset);
+  } else {
+    // General memory addressing - fold PtrOffset into the existing offset
+    // operand (index 3); every other operand is copied through unchanged.
+    assert(MOs.size() == 5 && "Unexpected memory operand list length");
+    for (unsigned i = 0; i != NumAddrOps; ++i) {
+      const MachineOperand &MO = MOs[i];
+      if (i == 3 && PtrOffset != 0) { // MOs[3] holds the existing offset.
+        assert((MO.isImm() || MO.isGlobal()) &&
+               "Unexpected memory operand type");
+        if (MO.isImm()) {
+          MIB.addImm(MO.getImm() + PtrOffset);
+        } else {
+          MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset() + PtrOffset,
+                               MO.getTargetFlags());
+        }
+      } else {
+        MIB.addOperand(MO);
+      }
+    }
+  }
  }
  
  static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
@@ -4916,7 +4939,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
  static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
                                unsigned OpNo, ArrayRef<MachineOperand> MOs,
                                MachineBasicBlock::iterator InsertPt,
-                              MachineInstr *MI, const TargetInstrInfo &TII) {
+                              MachineInstr *MI, const TargetInstrInfo &TII,
+                              int PtrOffset = 0) {
    // Omit the implicit operands, something BuildMI can't do.
    MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
                                                MI->getDebugLoc(), true);
@@ -4926,7 +4950,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
      MachineOperand &MO = MI->getOperand(i);
      if (i == OpNo) {
        assert(MO.isReg() && "Expected to fold into reg operand!");
-      addOperands(MIB, MOs);
+      addOperands(MIB, MOs, PtrOffset);
      } else {
        MIB.addOperand(MO);
      }
@@ -4948,6 +4972,40 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
    return MIB.addImm(0);
  }
  
+MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
+    MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+    unsigned Size, unsigned Align) const {
+  switch (MI->getOpcode()) {
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+    // Attempt to convert the load of inserted vector into a fold load
+    // of a single float.
+    if (OpNum == 2) {
+      unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+      unsigned ZMask = Imm & 15;
+      unsigned DstIdx = (Imm >> 4) & 3;
+      unsigned SrcIdx = (Imm >> 6) & 3;
+
+      unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+      if (Size <= RCSize && 4 <= Align) {
+        int PtrOffset = SrcIdx * 4;
+        unsigned NewImm = (DstIdx << 4) | ZMask;
+        unsigned NewOpCode =
+            (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
+                                                 : X86::INSERTPSrm);
+        MachineInstr *NewMI =
+            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+        NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+        return NewMI;
+      }
+    }
+    break;
+  };
+
+  return nullptr;
+}
+
  MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
      MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
      ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
@@ -4977,6 +5035,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
      return nullptr;
  
    MachineInstr *NewMI = nullptr;
+
+  // Try the custom folds first; nullptr means fall through to generic folding.
+  if (MachineInstr *CustomMI =
+          foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
+    return CustomMI;
+
    // Folding a memory location into the two-address part of a two-address
    // instruction is different than folding it other places.  It requires
    // replacing the *two* registers with the memory location.
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h

index e83a10f3b81c5eb987a48dfdb89bcd8ba2947f18..3f6b030c442da8630438968266e0e00e3d94daf9 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -512,6 +512,14 @@ private:
                                                MachineBasicBlock::iterator &MBBI,
                                                LiveVariables *LV) const;
  
+  /// Folds memory operands for special case instructions (e.g. those needing
+  /// a custom address adjustment). Returns nullptr if no custom fold applies.
+  MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI,
+                                        unsigned OpNum,
+                                        ArrayRef<MachineOperand> MOs,
+                                        MachineBasicBlock::iterator InsertPt,
+                                        unsigned Size, unsigned Align) const;
+
    /// isFrameOperand - Return true and the FrameIndex if the specified
    /// operand and follow operands form a reference to the stack frame.
    bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll

index f71ec5c10e69aa6151c44e59fd0f4c3b6a808eb4..341dd867e4ff47d18343180a9423201aa293cedf 100644 (file)
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll
@@ -32,7 +32,7 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
  ; On X32, account for the argument's move to registers
  ; X32: movl    4(%esp), %eax
  ; CHECK-NOT: mov
-; CHECK: insertps    $48
+; CHECK: vinsertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
  ; CHECK-NEXT: ret
    %1 = load <4 x float>, <4 x float>* %pb, align 16
    %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -46,7 +46,7 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
  ; X32: movl    4(%esp), %eax
  ; CHECK-NOT: mov
  ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: insertps    $96, 4(%{{...}}), %
+; CHECK: vinsertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
  ; CHECK-NEXT: ret
    %1 = load <4 x float>, <4 x float>* %pb, align 16
    %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -60,7 +60,7 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
  ; X32: movl    8(%esp), %ecx
  ; CHECK-NOT: mov
  ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: vinsertps    $192, 12(%{{...}},%{{...}}), %
+; CHECK: vinsertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
  ; CHECK-NEXT: ret
    %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
    %2 = load <4 x float>, <4 x float>* %1, align 16
diff --git a/test/CodeGen/X86/insertps-unfold-load-bug.ll b/test/CodeGen/X86/insertps-unfold-load-bug.ll

new file mode 100644 (file)

index 0000000..bf7c4bc
--- /dev/null
+++ b/test/CodeGen/X86/insertps-unfold-load-bug.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X32
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X64
+
+; Test for the case where insertps was folding the load of the insertion element, but a later optimization
+; was then manipulating the load.
+
+define <4 x float> @insertps_unfold(<4 x float>* %v0, <4 x float>* %v1) {
+; X32-LABEL: insertps_unfold:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    movaps (%eax), %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    addps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_unfold:
+; X64:       # BB#0:
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movaps (%rdi), %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    addps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %a = getelementptr inbounds <4 x float>, <4 x float>* %v1, i64 0, i64 1
+  %b = load float, float* %a, align 4
+  %c = insertelement <4 x float> undef, float %b, i32 0
+  %d = load <4 x float>, <4 x float>* %v1, align 16
+  %e = load <4 x float>, <4 x float>* %v0, align 16
+  %f = shufflevector <4 x float> %e, <4 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  %g = fadd <4 x float> %c, %f
+  ret <4 x float> %g
+}
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll

index d624c8dcbb470810f0f718d8a752cc3055502b02..0a83a9753b81a0939e482508d1341cfbb57f8614 100644 (file)
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -794,12 +794,12 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
  ; X32-LABEL: insertps_from_vector_load:
  ; X32:       ## BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: insertps_from_vector_load:
  ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
  ; X64-NEXT:    retq
    %1 = load <4 x float>, <4 x float>* %pb, align 16
    %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -812,12 +812,12 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
  ; X32-LABEL: insertps_from_vector_load_offset:
  ; X32:       ## BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X32-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: insertps_from_vector_load_offset:
  ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X64-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
  ; X64-NEXT:    retq
    %1 = load <4 x float>, <4 x float>* %pb, align 16
    %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -831,13 +831,13 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
  ; X32-NEXT:    shll $4, %ecx
-; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X32-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: insertps_from_vector_load_offset_2:
  ; X64:       ## BB#0:
  ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X64-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
  ; X64-NEXT:    retq
    %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
    %2 = load <4 x float>, <4 x float>* %1, align 16
@@ -968,12 +968,12 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
  ; X32-LABEL: pr20087:
  ; X32:       ## BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: pr20087:
  ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
  ; X64-NEXT:    retq
    %load = load <4 x float> , <4 x float> *%ptr
    %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll

index c7a2143b5b28022eb48138dd4bdef3898759f905..b86ec0ea22ff1677c1f939cfd7dcf29f008a1c9d 100644 (file)
--- a/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -946,7 +946,15 @@ define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) {
    ret <8 x float> %2
  }
  
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertps
+  ;CHECK:       vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK-NEXT:                                                                              {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
  
  define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
    ;CHECK-LABEL: stack_fold_maxpd
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll

index e90a89d36c09257fc47618d449a0fca8c10af6f9..105115bc7d25c0c64dc11c278de40aef69c84640 100644 (file)
--- a/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -637,7 +637,15 @@ define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
  }
  declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
  
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertps
+  ;CHECK:       insertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK-NEXT:                                                        {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
  
  define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
    ;CHECK-LABEL: stack_fold_maxpd
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Wed, 4 Nov 2015 20:48:09 +0000 (20:48 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Wed, 4 Nov 2015 20:48:09 +0000 (20:48 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86InstrInfo.cpp		patch \| blob \| history
lib/Target/X86/X86InstrInfo.h		patch \| blob \| history
test/CodeGen/X86/avx.ll		patch \| blob \| history
test/CodeGen/X86/insertps-unfold-load-bug.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/sse41.ll		patch \| blob \| history
test/CodeGen/X86/stack-folding-fp-avx1.ll		patch \| blob \| history
test/CodeGen/X86/stack-folding-fp-sse42.ll		patch \| blob \| history