From f329765d2304e2ab3eccd7760727513539874c93 Mon Sep 17 00:00:00 2001
From: Hal Finkel <hfinkel@anl.gov>
Date: Thu, 11 Dec 2014 22:51:06 +0000
Subject: [PATCH] [PowerPC] Better lowering for add/or of a FrameIndex

If we have an add (or an or that is really an add), where one operand is a
FrameIndex and the other operand is a small constant, we can combine the
lowering of the FrameIndex (which is lowered as an add of the FI and a zero
offset) with the constant operand.

Amusingly, this is an old potential improvement entry from
lib/Target/PowerPC/README.txt which had never been resolved. In short, we used
to lower:

        %X = alloca { i32, i32 }
        %Y = getelementptr {i32,i32}* %X, i32 0, i32 1
        ret i32* %Y

as:

        addi 3, 1, -8
        ori 3, 3, 4
        blr

and now we produce:

        addi 3, 1, -4
        blr

which is much more sensible.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224071 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp  | 49 +++++++++++++++++++------
 lib/Target/PowerPC/README.txt           | 19 ----------
 test/CodeGen/PowerPC/add-fi.ll          | 24 ++++++++++++
 test/CodeGen/PowerPC/ppc64-vaarg-int.ll |  2 +-
 4 files changed, 63 insertions(+), 31 deletions(-)
 create mode 100644 test/CodeGen/PowerPC/add-fi.ll
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 48c0388ceae..2e10fee3a93 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -112,6 +112,8 @@ namespace {
     /// base register.  Return the virtual register that holds this value.
     SDNode *getGlobalBaseReg();
 
+    SDNode *getFrameIndex(SDNode *SN, SDNode *N, unsigned Offset = 0);
+
     // Select - Convert the specified operand from a target-independent to a
     // target-specific node if it hasn't already been changed.
     SDNode *Select(SDNode *N) override;
@@ -373,6 +375,18 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
          && isInt32Immediate(N->getOperand(1).getNode(), Imm);
 }
 
+SDNode *PPCDAGToDAGISel::getFrameIndex(SDNode *SN, SDNode *N, unsigned Offset) {
+  SDLoc dl(SN);
+  int FI = cast<FrameIndexSDNode>(N)->getIndex();
+  SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
+  unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
+  if (SN->hasOneUse())
+    return CurDAG->SelectNodeTo(SN, Opc, N->getValueType(0), TFI,
+                                getSmallIPtrImm(Offset));
+  return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
+                                getSmallIPtrImm(Offset));
+}
+
 bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
   if (!Val)
     return false;
@@ -1019,16 +1033,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
   case PPCISD::GlobalBaseReg:
     return getGlobalBaseReg();
 
-  case ISD::FrameIndex: {
-    int FI = cast<FrameIndexSDNode>(N)->getIndex();
-    SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
-    unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
-    if (N->hasOneUse())
-      return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), TFI,
-                                  getSmallIPtrImm(0));
-    return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
-                                  getSmallIPtrImm(0));
-  }
+  case ISD::FrameIndex:
+    return getFrameIndex(N, N);
 
   case PPCISD::MFOCRF: {
     SDValue InFlag = N->getOperand(1);
@@ -1213,13 +1219,34 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     // Other cases are autogenerated.
     break;
   }
-  case ISD::OR:
+  case ISD::OR: {
     if (N->getValueType(0) == MVT::i32)
       if (SDNode *I = SelectBitfieldInsert(N))
         return I;
 
+    short Imm;
+    if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
+        isIntS16Immediate(N->getOperand(1), Imm)) {
+      APInt LHSKnownZero, LHSKnownOne;
+      CurDAG->computeKnownBits(N->getOperand(0), LHSKnownZero, LHSKnownOne);
+
+      // If this is equivalent to an add, then we can fold it with the
+      // FrameIndex calculation.
+      if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL)
+        return getFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+    }
+
     // Other cases are autogenerated.
     break;
+  }
+  case ISD::ADD: {
+    short Imm;
+    if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
+        isIntS16Immediate(N->getOperand(1), Imm))
+      return getFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+
+    break;
+  }
   case ISD::SHL: {
     unsigned Imm, SH, MB, ME;
     if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
index 514f8407c97..f96443c008f 100644
--- a/lib/Target/PowerPC/README.txt
+++ b/lib/Target/PowerPC/README.txt
@@ -184,25 +184,6 @@ http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 
 ===-------------------------------------------------------------------------===
 
-Compile offsets from allocas:
-
-int *%test() {
-        %X = alloca { int, int }
-        %Y = getelementptr {int,int}* %X, int 0, uint 1
-        ret int* %Y
-}
-
-into a single add, not two:
-
-_test:
-        addi r2, r1, -8
-        addi r3, r2, 4
-        blr
-
---> important for C++.
-
-===-------------------------------------------------------------------------===
-
 No loads or stores of the constants should be needed:
 
 struct foo { double X, Y; };
diff --git a/test/CodeGen/PowerPC/add-fi.ll b/test/CodeGen/PowerPC/add-fi.ll
new file mode 100644
index 00000000000..18892c8cdf5
--- /dev/null
+++ b/test/CodeGen/PowerPC/add-fi.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32* @test1() {
+        %X = alloca { i32, i32 }
+        %Y = getelementptr {i32,i32}* %X, i32 0, i32 1
+        ret i32* %Y
+
+; CHECK-LABEL: @test1
+; CHECK: addi 3, 1, -4
+; CHECK: blr
+}
+
+define i32* @test2() {
+        %X = alloca { i32, i32, i32, i32 }
+        %Y = getelementptr {i32,i32,i32,i32}* %X, i32 0, i32 3
+        ret i32* %Y
+
+; CHECK-LABEL: @test2
+; CHECK: addi 3, 1, -4
+; CHECK: blr
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64-vaarg-int.ll b/test/CodeGen/PowerPC/ppc64-vaarg-int.ll
index 5a63b01badc..c9a4f91fdde 100644
--- a/test/CodeGen/PowerPC/ppc64-vaarg-int.ll
+++ b/test/CodeGen/PowerPC/ppc64-vaarg-int.ll
@@ -16,5 +16,5 @@ declare void @llvm.va_start(i8*) nounwind
 
 ; CHECK: @intvaarg
 ; Make sure that the va pointer is incremented by 8 (not 4).
-; CHECK: addi{{.*}}, 8
+; CHECK: addi{{.*}}, 1, 64 
 
-- 
2.34.1