- Two-address pass should not assume unfolding is always successful.
author    Evan Cheng <evan.cheng@apple.com>
          Fri, 2 Jul 2010 20:36:18 +0000 (20:36 +0000)
committer Evan Cheng <evan.cheng@apple.com>
          Fri, 2 Jul 2010 20:36:18 +0000 (20:36 +0000)
- X86 unfolding should check whether the instruction being unfolded has
  memoperands. If there are no memoperands, it must conservatively assume the
  access is unaligned. If that would introduce an expensive unaligned SSE
  load / store, then unfoldMemoryOperand etc. should not unfold the
  instruction.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@107509 91177308-0d34-0410-b5e6-96231b3b80d8

lib/CodeGen/TwoAddressInstructionPass.cpp
lib/Target/X86/X86InstrInfo.cpp
test/CodeGen/X86/2010-07-02-UnfoldBug.ll [new file with mode: 0644]

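The two guards in the diff below can be illustrated outside of LLVM. What follows is a minimal standalone C++ sketch, not the in-tree code: names such as MemOperandInfo, UnfoldContext, shouldUnfold and tryUnfoldAndTransform are hypothetical stand-ins for MachineMemOperand, the register-class check, the subtarget query and the pass logic. It models both fixes: refuse to unfold when a missing memoperand would force a conservatively unaligned 128-bit SSE access on a subtarget where that is slow, and have the caller abandon the transformation instead of asserting when unfolding is refused.

#include <cstdio>
#include <vector>

// Hypothetical stand-in for MachineMemOperand: all we need is the alignment.
struct MemOperandInfo {
  unsigned Alignment; // known alignment of the access, in bytes
};

// Hypothetical bundle of the facts the real code reads from the MachineInstr,
// the register class, and the X86 subtarget.
struct UnfoldContext {
  std::vector<MemOperandInfo> MemOperands; // may be empty
  bool IsVR128;                            // operand uses a 128-bit SSE class
  bool FastUnalignedAccess;                // unaligned vector access is cheap
};

// Mirrors the guard added to unfoldMemoryOperand: with no memoperand the
// alignment is unknown, so the unfolded load / store would have to be the
// unaligned variant; refuse if that is slow on this subtarget.
static bool shouldUnfold(const UnfoldContext &Ctx) {
  if (Ctx.MemOperands.empty() && Ctx.IsVR128 && !Ctx.FastUnalignedAccess)
    return false;
  return true;
}

// Mirrors the two-address pass change: unfolding is fallible, so bail out
// and keep the folded instruction instead of asserting.
static bool tryUnfoldAndTransform(const UnfoldContext &Ctx) {
  if (!shouldUnfold(Ctx)) {
    std::fprintf(stderr, "2addr: ABANDONING UNFOLD\n");
    return false;
  }
  // ... emit the unfolded load and the register-register op here ...
  return true;
}

int main() {
  UnfoldContext NoMMO{{}, /*IsVR128=*/true, /*FastUnalignedAccess=*/false};
  UnfoldContext Aligned{{{16}}, /*IsVR128=*/true, /*FastUnalignedAccess=*/false};
  std::printf("no memoperand:      %s\n",
              tryUnfoldAndTransform(NoMMO) ? "unfolded" : "kept folded");
  std::printf("16-byte memoperand: %s\n",
              tryUnfoldAndTransform(Aligned) ? "unfolded" : "kept folded");
  return 0;
}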
index 0c97dad48b824ea44db2cb5f16ebbc5541ee3605..62fa0fdb7716fb5864b95d25341ca61b8ff4487d 100644 (file)
@@ -926,14 +926,12 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
           UnfoldTID.OpInfo[LoadRegIndex].getRegClass(TRI);
         unsigned Reg = MRI->createVirtualRegister(RC);
         SmallVector<MachineInstr *, 2> NewMIs;
-        bool Success =
-          TII->unfoldMemoryOperand(MF, mi, Reg,
-                                   /*UnfoldLoad=*/true, /*UnfoldStore=*/false,
-                                   NewMIs);
-        (void)Success;
-        assert(Success &&
-               "unfoldMemoryOperand failed when getOpcodeAfterMemoryUnfold "
-               "succeeded!");
+        if (!TII->unfoldMemoryOperand(MF, mi, Reg,
+                                      /*UnfoldLoad=*/true,/*UnfoldStore=*/false,
+                                      NewMIs)) {
+          DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+          return false;
+        }
         assert(NewMIs.size() == 2 &&
                "Unfolded a load into multiple instructions!");
         // The load was previously folded, so this is the only use.
index caa623399debb3ff6f32deeaa4bd07755e063f05..c1d66cb570222a3a40eba9fc594bea530cce506b 100644 (file)
@@ -2159,7 +2159,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
                                   MachineInstr::mmo_iterator MMOBegin,
                                   MachineInstr::mmo_iterator MMOEnd,
                                   SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  bool isAligned = (*MMOBegin)->getAlignment() >= 16;
+  bool isAligned = *MMOBegin && (*MMOBegin)->getAlignment() >= 16;
   unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
   DebugLoc DL;
   MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
@@ -2189,7 +2189,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
                                  MachineInstr::mmo_iterator MMOBegin,
                                  MachineInstr::mmo_iterator MMOEnd,
                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  bool isAligned = (*MMOBegin)->getAlignment() >= 16;
+  bool isAligned = *MMOBegin && (*MMOBegin)->getAlignment() >= 16;
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
   DebugLoc DL;
   MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
@@ -2693,6 +2693,13 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
   const TargetInstrDesc &TID = get(Opc);
   const TargetOperandInfo &TOI = TID.OpInfo[Index];
   const TargetRegisterClass *RC = TOI.getRegClass(&RI);
+  if (!MI->hasOneMemOperand() &&
+      RC == &X86::VR128RegClass &&
+      !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+  // Without memoperands, loadRegFromAddr and storeRegToAddr will
+    // conservatively assume the address is unaligned. That's bad for
+    // performance.
+    return false;
   SmallVector<MachineOperand, X86AddrNumOperands> AddrOps;
   SmallVector<MachineOperand,2> BeforeOps;
   SmallVector<MachineOperand,2> AfterOps;
@@ -2834,7 +2841,12 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
               MachineInstr::mmo_iterator> MMOs =
       MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                             cast<MachineSDNode>(N)->memoperands_end());
-    bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned load.
+      return false;
+    bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
     Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
                               VT, MVT::Other, &AddrOps[0], AddrOps.size());
     NewNodes.push_back(Load);
@@ -2871,7 +2883,12 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
               MachineInstr::mmo_iterator> MMOs =
       MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                              cast<MachineSDNode>(N)->memoperands_end());
-    bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned store.
+      return false;
+    bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
     SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
                                                          isAligned, TM),
                                        dl, MVT::Other,
diff --git a/test/CodeGen/X86/2010-07-02-UnfoldBug.ll b/test/CodeGen/X86/2010-07-02-UnfoldBug.ll
new file mode 100644 (file)
index 0000000..79219dc
--- /dev/null
@@ -0,0 +1,99 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin
+; rdar://8154265
+
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define void @_ZN2CA3OGL20fill_surface_mesh_3dERNS0_7ContextEPKNS_6Render13MeshTransformEPKNS0_5LayerEPNS0_7SurfaceEfNS0_13TextureFilterESC_f() nounwind optsize ssp {
+entry:
+  br i1 undef, label %bb2.thread, label %bb2
+
+bb2.thread:                                       ; preds = %entry
+  br i1 undef, label %bb41, label %bb10.preheader
+
+bb2:                                              ; preds = %entry
+  unreachable
+
+bb10.preheader:                                   ; preds = %bb2.thread
+  br i1 undef, label %bb9, label %bb12
+
+bb9:                                              ; preds = %bb9, %bb10.preheader
+  br i1 undef, label %bb9, label %bb12
+
+bb12:                                             ; preds = %bb9, %bb10.preheader
+  br i1 undef, label %bb4.i.i, label %bb3.i.i
+
+bb3.i.i:                                          ; preds = %bb12
+  unreachable
+
+bb4.i.i:                                          ; preds = %bb12
+  br i1 undef, label %bb8.i.i, label %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit
+
+bb8.i.i:                                          ; preds = %bb4.i.i
+  br i1 undef, label %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit, label %bb9.i.i
+
+bb9.i.i:                                          ; preds = %bb8.i.i
+  br i1 undef, label %bb11.i.i, label %bb10.i.i
+
+bb10.i.i:                                         ; preds = %bb9.i.i
+  unreachable
+
+bb11.i.i:                                         ; preds = %bb9.i.i
+  unreachable
+
+_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit: ; preds = %bb8.i.i, %bb4.i.i
+  br i1 undef, label %bb19, label %bb14
+
+bb14:                                             ; preds = %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit
+  unreachable
+
+bb19:                                             ; preds = %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit
+  br i1 undef, label %bb.i50, label %bb6.i
+
+bb.i50:                                           ; preds = %bb19
+  unreachable
+
+bb6.i:                                            ; preds = %bb19
+  br i1 undef, label %bb28, label %bb.nph106
+
+bb22:                                             ; preds = %bb24.preheader
+  br i1 undef, label %bb2.i.i, label %bb.i.i49
+
+bb.i.i49:                                         ; preds = %bb22
+  %0 = load float* undef, align 4                 ; <float> [#uses=1]
+  %1 = insertelement <4 x float> undef, float %0, i32 0 ; <<4 x float>> [#uses=1]
+  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> %1) nounwind readnone ; <<4 x float>> [#uses=1]
+  %3 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %2, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>) nounwind readnone ; <<4 x float>> [#uses=1]
+  %4 = extractelement <4 x float> %3, i32 0       ; <float> [#uses=1]
+  store float %4, float* undef, align 4
+  %5 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> undef) nounwind readnone ; <<4 x float>> [#uses=1]
+  %6 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %5, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>) nounwind readnone ; <<4 x float>> [#uses=1]
+  %7 = extractelement <4 x float> %6, i32 0       ; <float> [#uses=1]
+  store float %7, float* undef, align 4
+  unreachable
+
+bb2.i.i:                                          ; preds = %bb22
+  unreachable
+
+bb26.loopexit:                                    ; preds = %bb24.preheader
+  br i1 undef, label %bb28, label %bb24.preheader
+
+bb.nph106:                                        ; preds = %bb6.i
+  br label %bb24.preheader
+
+bb24.preheader:                                   ; preds = %bb.nph106, %bb26.loopexit
+  br i1 undef, label %bb22, label %bb26.loopexit
+
+bb28:                                             ; preds = %bb26.loopexit, %bb6.i
+  unreachable
+
+bb41:                                             ; preds = %bb2.thread
+  br i1 undef, label %return, label %bb46
+
+bb46:                                             ; preds = %bb41
+  ret void
+
+return:                                           ; preds = %bb41
+  ret void
+}