[X86] Allow merging of immediates within a basic block for code size savings

author Michael Kuperstein <michael.m.kuperstein@intel.com>

Tue, 11 Aug 2015 14:10:58 +0000 (14:10 +0000)

committer Michael Kuperstein <michael.m.kuperstein@intel.com>

Tue, 11 Aug 2015 14:10:58 +0000 (14:10 +0000)
author Michael Kuperstein <michael.m.kuperstein@intel.com>
Tue, 11 Aug 2015 14:10:58 +0000 (14:10 +0000)
committer Michael Kuperstein <michael.m.kuperstein@intel.com>
Tue, 11 Aug 2015 14:10:58 +0000 (14:10 +0000)
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp

index 34ea3b781096cc14504434aedcb88e16e54ce962..d37db7f788b285e216057516d77e8ac6534118cb 100644 (file)
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -283,6 +283,82 @@ namespace {
          Segment = CurDAG->getRegister(0, MVT::i32);
      }
  
+    // Utility function to determine whether we should avoid selecting
+    // immediate forms of instructions for better code size or not.
+    // At a high level, we'd like to avoid such instructions when
+    // we have similar constants used within the same basic block
+    // that can be kept in a register.
+    //
+    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
+      uint32_t UseCount = 0;
+
+      // Do not want to hoist if we're not optimizing for size.
+      // TODO: We'd like to remove this restriction.
+      // See the comment in X86InstrInfo.td for more info.
+      if (!OptForSize)
+        return false;
+
+      // Walk all the users of the immediate.
+      for (SDNode::use_iterator UI = N->use_begin(),
+           UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
+
+        SDNode *User = *UI;
+
+        // This user is already selected. Count it as a legitimate use and
+        // move on.
+        if (User->isMachineOpcode()) {
+          UseCount++;
+          continue;
+        }
+
+        // We want to count stores of immediates as real uses.
+        if (User->getOpcode() == ISD::STORE &&
+            User->getOperand(1).getNode() == N) {
+          UseCount++;
+          continue;
+        }
+
+        // We don't currently match users that have > 2 operands (except
+        // for stores, which are handled above)
+        // Those instruction won't match in ISEL, for now, and would
+        // be counted incorrectly.
+        // This may change in the future as we add additional instruction
+        // types.
+        if (User->getNumOperands() != 2)
+          continue;
+        
+        // Immediates that are used for offsets as part of stack
+        // manipulation should be left alone. These are typically
+        // used to indicate SP offsets for argument passing and
+        // will get pulled into stores/pushes (implicitly).
+        if (User->getOpcode() == X86ISD::ADD ||
+            User->getOpcode() == ISD::ADD    ||
+            User->getOpcode() == X86ISD::SUB ||
+            User->getOpcode() == ISD::SUB) {
+
+          // Find the other operand of the add/sub.
+          SDValue OtherOp = User->getOperand(0);
+          if (OtherOp.getNode() == N)
+            OtherOp = User->getOperand(1);
+
+          // Don't count if the other operand is SP.
+          RegisterSDNode *RegNode;
+          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
+              (RegNode = dyn_cast_or_null<RegisterSDNode>(
+                 OtherOp->getOperand(1).getNode())))
+            if ((RegNode->getReg() == X86::ESP) ||
+                (RegNode->getReg() == X86::RSP))
+              continue;
+        }
+
+        // ... otherwise, count this and move on.
+        UseCount++;
+      }
+
+      // If we have more than 1 use, then recommend for hoisting.
+      return (UseCount > 1);
+    }
+
      /// getI8Imm - Return a target constant with the specified value, of type
      /// i8.
      inline SDValue getI8Imm(unsigned Imm, SDLoc DL) {
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td

index 5e19ad448fc7fcfade63bbc2428900eebd7a1be4..64807aebd302214c59a9389abccdf3c7f7825279 100644 (file)
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -615,14 +615,14 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
  def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
  
  
-def Xi8  : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem ,
-                       Imm8 , i8imm ,    imm,          i8imm   , invalid_node,
+def Xi8  : X86TypeInfo<i8, "b", GR8, loadi8, i8mem,
+                       Imm8, i8imm, imm8_su, i8imm, invalid_node,
                         0, OpSizeFixed, 0>;
  def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
-                       Imm16, i16imm,    imm,          i16i8imm, i16immSExt8,
+                       Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su,
                         1, OpSize16, 0>;
  def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
-                       Imm32, i32imm,    imm,          i32i8imm, i32immSExt8,
+                       Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su,
                         1, OpSize32, 0>;
  def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
                         Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td

index d626e078110be66addfe30b61c22956c4bba08d4..17b13e3220df35fd97a55bb74e2c434d9b65c216 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -873,6 +873,40 @@ def i16immSExt8  : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>;
  def i32immSExt8  : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>;
  def i64immSExt8  : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
  
+// If we have multiple users of an immediate, it's much smaller to reuse
+// the register, rather than encode the immediate in every instruction.
+// This has the risk of increasing register pressure from stretched live
+// ranges, however, the immediates should be trivial to rematerialize by
+// the RA in the event of high register pressure.
+// TODO : This is currently enabled for stores and binary ops. There are more
+// cases for which this can be enabled, though this catches the bulk of the
+// issues.
+// TODO2 : This should really also be enabled under O2, but there's currently
+// an issue with RA where we don't pull the constants into their users
+// when we rematerialize them. I'll follow-up on enabling O2 after we fix that
+// issue.
+// TODO3 : This is currently limited to single basic blocks (DAG creation
+// pulls block immediates to the top and merges them if necessary).
+// Eventually, it would be nice to allow ConstantHoisting to merge constants
+// globally for potentially added savings.
+//
+def imm8_su : PatLeaf<(i8 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm16_su : PatLeaf<(i16 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm32_su : PatLeaf<(i32 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
  
  def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>;
  
@@ -1283,13 +1317,13 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
  let SchedRW = [WriteStore] in {
  def MOV8mi  : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
                     "mov{b}\t{$src, $dst|$dst, $src}",
-                   [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
+                   [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>;
  def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
                     "mov{w}\t{$src, $dst|$dst, $src}",
-                   [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
+                   [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
  def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
                     "mov{l}\t{$src, $dst|$dst, $src}",
-                   [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
+                   [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
  def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
                         "mov{q}\t{$src, $dst|$dst, $src}",
                         [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
diff --git a/test/CodeGen/X86/immediate_merging.ll b/test/CodeGen/X86/immediate_merging.ll

new file mode 100644 (file)

index 0000000..8aef9c2
--- /dev/null
+++ b/test/CodeGen/X86/immediate_merging.ll
@@ -0,0 +1,82 @@
+; RUN: llc -o - -mtriple=i386-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -o - -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@a = common global i32 0, align 4
+@b = common global i32 0, align 4
+@c = common global i32 0, align 4
+@e = common global i32 0, align 4
+@x = common global i32 0, align 4
+@f = common global i32 0, align 4
+@h = common global i32 0, align 4
+@i = common global i32 0, align 4
+
+; Test -Os to make sure immediates with multiple users don't get pulled in to
+; instructions.
+define i32 @foo() optsize {
+; CHECK-LABEL: foo:
+; CHECK: movl $1234, [[R1:%[a-z]+]]
+; CHECK-NOT: movl $1234, a
+; CHECK-NOT: movl $1234, b
+; CHECK-NOT: movl $12, c
+; CHECK-NOT: cmpl $12, e
+; CHECK: movl [[R1]], a
+; CHECK: movl [[R1]], b
+
+entry:
+  store i32 1234, i32* @a
+  store i32 1234, i32* @b
+  store i32 12, i32* @c
+  %0 = load i32, i32* @e
+  %cmp = icmp eq i32 %0, 12
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @x
+  br label %if.end
+
+; New block.. Make sure 1234 isn't live across basic blocks from before.
+; CHECK: movl $1234, f
+; CHECK: movl $555, [[R3:%[a-z]+]]
+; CHECK-NOT: movl $555, h
+; CHECK-NOT: addl $555, i
+; CHECK: movl [[R3]], h
+; CHECK: addl [[R3]], i
+
+if.end:                                           ; preds = %if.then, %entry
+  store i32 1234, i32* @f
+  store i32 555, i32* @h
+  %1 = load i32, i32* @i
+  %add1 = add nsw i32 %1, 555
+  store i32 %add1, i32* @i
+  ret i32 0
+}
+
+; Test -O2 to make sure that all immediates get pulled in to their users.
+define i32 @foo2() {
+; CHECK-LABEL: foo2:
+; CHECK: movl $1234, a
+; CHECK: movl $1234, b
+
+entry:
+  store i32 1234, i32* @a
+  store i32 1234, i32* @b
+
+  ret i32 0
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #1
+
+@AA = common global [100 x i8] zeroinitializer, align 1
+
+; memset gets lowered in DAG. Constant merging should hoist all the
+; immediates used to store to the individual memory locations. Make
+; sure we don't directly store the immediates.
+define void @foomemset() optsize {
+; CHECK-LABEL: foomemset:
+; CHECK-NOT: movl ${{.*}}, AA
+; CHECK: mov{{l|q}} %{{e|r}}ax, AA
+
+entry:
+  call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i32 1, i1 false)
+  ret void
+}
diff --git a/test/CodeGen/X86/remat-invalid-liveness.ll b/test/CodeGen/X86/remat-invalid-liveness.ll

deleted file mode 100644 (file)

index c6b43b0..0000000
--- a/test/CodeGen/X86/remat-invalid-liveness.ll
+++ /dev/null
@@ -1,85 +0,0 @@
-; RUN: llc %s -mcpu=core2 -o - | FileCheck %s
-; This test was failing while tracking the liveness in the register scavenger
-; during the branching folding pass. The allocation of the subregisters was
-; incorrect.
-; I.e., the faulty pattern looked like:
-; CH = movb 64
-; ECX = movl 3 <- CH was killed here.
-; CH = subb CH, ...
-;
-; This reduced test case triggers the crash before the fix, but does not
-; strictly speaking check that the resulting code is correct.
-; To check that the code is actually correct we would need to check the
-; liveness of the produced code.
-;
-; Currently, we check that after ECX = movl 3, we do not have subb CH,
-; whereas CH could have been redefine in between and that would have been
-; totally fine.
-; <rdar://problem/16582185>
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-target triple = "i386-apple-macosx10.9"
-
-%struct.A = type { %struct.B, %struct.C, %struct.D*, [1 x i8*] }
-%struct.B = type { i32, [4 x i8] }
-%struct.C = type { i128 }
-%struct.D = type { {}*, [0 x i32] }
-%union.E = type { i32 }
-
-; CHECK-LABEL: __XXX1:
-; CHECK: movl $3, %ecx
-; CHECK-NOT: subb %{{[a-z]+}}, %ch
-; Function Attrs: nounwind optsize ssp
-define fastcc void @__XXX1(%struct.A* %ht) #0 {
-entry:
-  %const72 = bitcast i128 72 to i128
-  %const3 = bitcast i128 3 to i128
-  switch i32 undef, label %if.end196 [
-    i32 1, label %sw.bb.i
-    i32 3, label %sw.bb2.i
-  ]
-
-sw.bb.i:                                          ; preds = %entry
-  %call.i.i.i = tail call i32 undef(%struct.A* %ht, i8 zeroext 22, i32 undef, i32 0, %struct.D* undef)
-  %bf.load.i.i = load i128, i128* undef, align 4
-  %bf.lshr.i.i = lshr i128 %bf.load.i.i, %const72
-  %shl1.i.i = shl nuw nsw i128 %bf.lshr.i.i, 8
-  %shl.i.i = trunc i128 %shl1.i.i to i32
-  br i1 undef, label %cond.false10.i.i, label %__XXX2.exit.i.i
-
-__XXX2.exit.i.i:                    ; preds = %sw.bb.i
-  %extract11.i.i.i = lshr i128 %bf.load.i.i, %const3
-  %extract.t12.i.i.i = trunc i128 %extract11.i.i.i to i32
-  %bf.cast7.i.i.i = and i32 %extract.t12.i.i.i, 3
-  %arrayidx.i.i.i = getelementptr inbounds %struct.A, %struct.A* %ht, i32 0, i32 3, i32 %bf.cast7.i.i.i
-  br label %cond.end12.i.i
-
-cond.false10.i.i:                                 ; preds = %sw.bb.i
-  %arrayidx.i6.i.i = getelementptr inbounds %struct.A, %struct.A* %ht, i32 0, i32 3, i32 0
-  br label %cond.end12.i.i
-
-cond.end12.i.i:                                   ; preds = %cond.false10.i.i, %__XXX2.exit.i.i
-  %.sink.in.i.i = phi i8** [ %arrayidx.i.i.i, %__XXX2.exit.i.i ], [ %arrayidx.i6.i.i, %cond.false10.i.i ]
-  %.sink.i.i = load i8*, i8** %.sink.in.i.i, align 4
-  %tmp = bitcast i8* %.sink.i.i to %union.E*
-  br i1 undef, label %for.body.i.i, label %if.end196
-
-for.body.i.i:                                     ; preds = %for.body.i.i, %cond.end12.i.i
-  %weak.i.i = getelementptr inbounds %union.E, %union.E* %tmp, i32 undef, i32 0
-  %tmp1 = load i32, i32* %weak.i.i, align 4
-  %cmp36.i.i = icmp ne i32 %tmp1, %shl.i.i
-  %or.cond = and i1 %cmp36.i.i, false
-  br i1 %or.cond, label %for.body.i.i, label %if.end196
-
-sw.bb2.i:                                         ; preds = %entry
-  %bf.lshr.i85.i = lshr i128 undef, %const72
-  br i1 undef, label %if.end196, label %__XXX2.exit.i95.i
-
-__XXX2.exit.i95.i:                  ; preds = %sw.bb2.i
-  %extract11.i.i91.i = lshr i128 undef, %const3
-  br label %if.end196
-
-if.end196:                                        ; preds = %__XXX2.exit.i95.i, %sw.bb2.i, %for.body.i.i, %cond.end12.i.i, %entry
-  ret void
-}
-
-attributes #0 = { nounwind optsize ssp "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
author	Michael Kuperstein <michael.m.kuperstein@intel.com>
	Tue, 11 Aug 2015 14:10:58 +0000 (14:10 +0000)
committer	Michael Kuperstein <michael.m.kuperstein@intel.com>
	Tue, 11 Aug 2015 14:10:58 +0000 (14:10 +0000)
lib/Target/X86/X86ISelDAGToDAG.cpp		patch \| blob \| history
lib/Target/X86/X86InstrArithmetic.td		patch \| blob \| history
lib/Target/X86/X86InstrInfo.td		patch \| blob \| history
test/CodeGen/X86/immediate_merging.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/remat-invalid-liveness.ll	[deleted file]	patch \| blob \| history