From: Hal Finkel <hfinkel@anl.gov>
Date: Tue, 9 Jul 2013 06:34:51 +0000 (+0000)
Subject: PPC: Allocate RS spill slot for unaligned i64 load/store
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=fa55969acb64da32acf6305064c9f6e3c237b74e;p=oota-llvm.git

PPC: Allocate RS spill slot for unaligned i64 load/store

This fixes another bug found by llvm-stress!

If we happen to be doing an i64 load or store into a stack slot that has less
than a 4-byte alignment, then the frame-index elimination may need to use an
indexed load or store instruction (because the offset may not be a multiple of
4, a requirement of the STD/LD instructions). The extra register needed to hold
the offset comes from the register scavenger, and it is possible that the
scavenger will need to use an emergency spill slot. As a result, we need to
make sure that a spill slot is allocated when doing an i64 load/store into a
less-than-4-byte-aligned stack slot.

Because test cases for things like this tend to be fairly fragile, I've
concatenated a few small bugpoint-reduced test cases together to form the
regression test.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185907 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 1759c0415e5..cf41c02e749 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1030,6 +1030,35 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
   return false;
 }
 
+// If we happen to be doing an i64 load or store into a stack slot that has
+// less than a 4-byte alignment, then the frame-index elimination may need to
+// use an indexed load or store instruction (because the offset may not be a
+// multiple of 4). The extra register needed to hold the offset comes from the
+// register scavenger, and it is possible that the scavenger will need to use
+// an emergency spill slot. As a result, we need to make sure that a spill slot
+// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
+// stack slot.
+static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
+  // FIXME: This does not handle the LWA case.
+  if (VT != MVT::i64)
+    return;
+
+  // This should not be needed for negative FIs, which come from argument
+  // lowering, because the ABI should guarantee the necessary alignment.
+  if (FrameIdx < 0)
+    return;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  unsigned Align = MFI->getObjectAlignment(FrameIdx);
+  if (Align >= 4)
+    return; // Slot is at least 4-byte aligned; nothing to do.
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasNonRISpills(); // Flag the function; see comment above.
+}
+
 /// Returns true if the address N can be represented by a base register plus
 /// a signed 16-bit displacement [r+imm], and if it is not better
 /// represented as reg+reg.  If Aligned is true, only accept displacements
@@ -1051,6 +1080,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
       Disp = DAG.getTargetConstant(imm, N.getValueType());
       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
         Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+        fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
       } else {
         Base = N.getOperand(0);
       }
@@ -1115,9 +1145,10 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
   }
 
   Disp = DAG.getTargetConstant(0, getPointerTy());
-  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
     Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
-  else
+    fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+  } else
     Base = N;
   return true;      // [r+0]
 }
diff --git a/test/CodeGen/PowerPC/std-unal-fi.ll b/test/CodeGen/PowerPC/std-unal-fi.ll
new file mode 100644
index 00000000000..8b9606e1624
--- /dev/null
+++ b/test/CodeGen/PowerPC/std-unal-fi.ll
@@ -0,0 +1,119 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @autogen_SD4932(i8) {
+BB:
+  %A4 = alloca i8 ; small stack slot, reinterpreted as ppc_fp128 via %PC10 below
+  %A = alloca <1 x ppc_fp128>
+  %Shuff = shufflevector <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 undef, i32 29, i32 31, i32 1, i32 3, i32 5>
+  br label %CF
+
+CF:                                               ; preds = %CF80, %CF, %BB
+  %L5 = load i64* undef
+  store i8 %0, i8* %A4
+  %Shuff7 = shufflevector <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> %Shuff, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 undef, i32 20, i32 22, i32 24, i32 26>
+  %PC10 = bitcast i8* %A4 to ppc_fp128* ; view the i8 slot as a ppc_fp128
+  br i1 undef, label %CF, label %CF77
+
+CF77:                                             ; preds = %CF81, %CF83, %CF77, %CF
+  br i1 undef, label %CF77, label %CF82
+
+CF82:                                             ; preds = %CF82, %CF77
+  %L19 = load i64* undef
+  store <1 x ppc_fp128> zeroinitializer, <1 x ppc_fp128>* %A
+  store i8 -65, i8* %A4
+  br i1 undef, label %CF82, label %CF83
+
+CF83:                                             ; preds = %CF82
+  %L34 = load i64* undef
+  br i1 undef, label %CF77, label %CF81
+
+CF81:                                             ; preds = %CF83
+  %Shuff43 = shufflevector <16 x i32> %Shuff7, <16 x i32> undef, <16 x i32> <i32 15, i32 17, i32 19, i32 21, i32 23, i32 undef, i32 undef, i32 29, i32 31, i32 undef, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
+  store ppc_fp128 0xM00000000000000000000000000000000, ppc_fp128* %PC10 ; wide store into the i8 slot
+  br i1 undef, label %CF77, label %CF78
+
+CF78:                                             ; preds = %CF78, %CF81
+  br i1 undef, label %CF78, label %CF79
+
+CF79:                                             ; preds = %CF79, %CF78
+  br i1 undef, label %CF79, label %CF80
+
+CF80:                                             ; preds = %CF79
+  store i64 %L19, i64* undef
+  %Cmp75 = icmp uge i32 206779, undef
+  br i1 %Cmp75, label %CF, label %CF76
+
+CF76:                                             ; preds = %CF80
+  store i64 %L5, i64* undef
+  store i64 %L34, i64* undef
+  ret void
+}
+
+define void @autogen_SD88042(i8*, i32*, i8) {
+BB:
+  %A4 = alloca <2 x i1> ; stack slot later accessed as i64 through %PC
+  %A = alloca <16 x float>
+  %L = load i8* %0
+  %Sl = select i1 false, <16 x float>* %A, <16 x float>* %A
+  %PC = bitcast <2 x i1>* %A4 to i64* ; view the <2 x i1> slot as an i64
+  %Sl27 = select i1 false, i8 undef, i8 %L
+  br label %CF
+
+CF:                                               ; preds = %CF78, %CF, %BB
+  %PC33 = bitcast i32* %1 to i32*
+  br i1 undef, label %CF, label %CF77
+
+CF77:                                             ; preds = %CF80, %CF77, %CF
+  store <16 x float> zeroinitializer, <16 x float>* %Sl
+  %L58 = load i32* %PC33
+  store i8 0, i8* %0
+  br i1 undef, label %CF77, label %CF80
+
+CF80:                                             ; preds = %CF77
+  store i64 0, i64* %PC ; i64 store into the %A4 stack slot
+  %E67 = extractelement <8 x i1> zeroinitializer, i32 1
+  br i1 %E67, label %CF77, label %CF78
+
+CF78:                                             ; preds = %CF80
+  %Cmp73 = icmp eq i32 189865, %L58
+  br i1 %Cmp73, label %CF, label %CF76
+
+CF76:                                             ; preds = %CF78
+  store i8 %2, i8* %0
+  store i8 %Sl27, i8* %0
+  ret void
+}
+
+define void @autogen_SD37497(i8*, i32*, i64*) {
+BB:
+  %A1 = alloca i1 ; stack slot later accessed as i64 through %PC
+  %I8 = insertelement <1 x i32> <i32 -1>, i32 454855, i32 0
+  %Cmp = icmp ult <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, undef
+  %L10 = load i64* %2
+  %E11 = extractelement <4 x i1> %Cmp, i32 2
+  br label %CF72
+
+CF72:                                             ; preds = %CF74, %CF72, %BB
+  store double 0xB47BB29A53790718, double* undef
+  %E18 = extractelement <1 x i32> <i32 -1>, i32 0
+  %FC22 = sitofp <1 x i32> %I8 to <1 x float>
+  br i1 undef, label %CF72, label %CF74
+
+CF74:                                             ; preds = %CF72
+  store i8 0, i8* %0
+  %PC = bitcast i1* %A1 to i64* ; view the i1 slot as an i64
+  %L31 = load i64* %PC ; i64 load from the %A1 stack slot
+  store i64 477323, i64* %PC ; i64 store into the %A1 stack slot
+  %Sl37 = select i1 false, i32* undef, i32* %1
+  %Cmp38 = icmp ugt i1 undef, undef
+  br i1 %Cmp38, label %CF72, label %CF73
+
+CF73:                                             ; preds = %CF74
+  store i64 %L31, i64* %PC ; i64 store into the %A1 stack slot
+  %B55 = fdiv <1 x float> undef, %FC22
+  %Sl63 = select i1 %E11, i32* undef, i32* %Sl37
+  store i32 %E18, i32* %Sl63
+  store i64 %L10, i64* %PC ; i64 store into the %A1 stack slot
+  ret void
+}