From 9620204c3b5e96e55e1eb7ab6ac2fe9d62146995 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 2 Nov 2015 23:22:49 +0000
Subject: [PATCH] [CGP] widen switch condition and case constants to target's
 register width (2nd try)

This is a redo of r251849 except the tests have been split into arch-specific folders
to hopefully make the bots happy.

This is a follow-up from the discussion in D12965. The block-at-a-time limitation of
SelectionDAG also came up in D13297.

Without the InstCombine change from D12965, I don't expect this patch to make any
difference in the real world because InstCombine does not shrink cases like this in
visitSwitchInst(). But we need to have this CGP safety harness in place before
proceeding with any shrinkage in D12965, so we won't generate extra extends for compares.

I've opted for IR regression tests in the patch because that seems like a clearer way to
test the transform, but PowerPC CodeGen for an i16 widening test is shown below. x86
will need more work to solve: https://llvm.org/bugs/show_bug.cgi?id=22473

Before:
BB#0:
  mr 4, 3
  extsh. 3, 4
  ble 0, .LBB0_5
 BB#1:
  cmpwi  3, 99
  bgt    0, .LBB0_9
 BB#2:
  rlwinm 4, 4, 0, 16, 31      <--- 32-bit mask/extend
  li 3, 0
  cmplwi         4, 1
  beqlr 0
 BB#3:
  cmplwi         4, 10
  bne    0, .LBB0_12
 BB#4:
  li 3, 1
  blr
.LBB0_5:
  rlwinm 3, 4, 0, 16, 31      <--- 32-bit mask/extend
  cmplwi         3, 65436
  beq    0, .LBB0_13
 BB#6:
  cmplwi         3, 65526
  beq    0, .LBB0_15
 BB#7:
  cmplwi         3, 65535
  bne    0, .LBB0_12
 BB#8:
  li 3, 4
  blr
.LBB0_9:
  rlwinm 3, 4, 0, 16, 31      <--- 32-bit mask/extend
  cmplwi         3, 100
  beq    0, .LBB0_14
...

After:
BB#0:
  rlwinm 4, 3, 0, 16, 31      <--- mask/extend to 32-bit and then use that for comparisons
  cmpwi  4, 999
  ble 0, .LBB0_5
 BB#1:
  lis 3, 0
  ori 3, 3, 65525
  cmpw   4, 3
  bgt    0, .LBB0_9
 BB#2:
  cmplwi         4, 1000
  beq    0, .LBB0_14
 BB#3:
  cmplwi         4, 65436
  bne    0, .LBB0_13
 BB#4:
  li 3, 6
  blr
.LBB0_5:
  li 3, 0
  cmplwi         4, 1
  beqlr 0
 BB#6:
  cmplwi         4, 10
  beq    0, .LBB0_12
 BB#7:
  cmplwi         4, 100
  bne    0, .LBB0_13
 BB#8:
  li 3, 2
  blr
.LBB0_9:
  cmplwi         4, 65526
  beq    0, .LBB0_15
 BB#10:
  cmplwi         4, 65535
  bne    0, .LBB0_13
...


Differential Revision: http://reviews.llvm.org/D13532



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251857 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CodeGenPrepare.cpp                | 47 +++++++++
 .../CodeGenPrepare/AArch64/widen_switch.ll    | 95 +++++++++++++++++++
 .../CodeGenPrepare/X86/widen_switch.ll        | 95 +++++++++++++++++++
 3 files changed, 237 insertions(+)
 create mode 100644 test/Transforms/CodeGenPrepare/AArch64/widen_switch.ll
 create mode 100644 test/Transforms/CodeGenPrepare/X86/widen_switch.ll
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 87669d772d7..76ca1350b41 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -175,6 +175,7 @@ class TypePromotionTransaction;
     bool optimizeExtUses(Instruction *I);
     bool optimizeSelectInst(SelectInst *SI);
     bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
+    bool optimizeSwitchInst(SwitchInst *CI);
     bool optimizeExtractElementInst(Instruction *Inst);
     bool dupRetToEnableTailCallOpts(BasicBlock *BB);
     bool placeDbgValues(Function &F);
@@ -4399,6 +4400,49 @@ bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
   return MadeChange;
 }
 
+bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
+  if (!TLI || !DL)
+    return false;
+
+  Value *Cond = SI->getCondition();
+  Type *OldType = Cond->getType();
+  LLVMContext &Context = Cond->getContext();
+  MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
+  unsigned RegWidth = RegType.getSizeInBits();
+
+  if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
+    return false;
+
+  // Reaching this point means the register is wider than the condition type.
+  // Widen the switch condition and each case constant to the register width.
+  // With the condition already at register width, the comparisons emitted
+  // for the individual cases do not each need their own extension to the
+  // preferred register width, so we will potentially eliminate N-1 extends,
+  // where N is the number of cases in the switch.
+  auto *NewType = Type::getIntNTy(Context, RegWidth);
+
+  // Zero-extend by default. The exception is a condition that is itself a
+  // function argument carrying the sign-extension attribute (hasSExtAttr):
+  // that value is already being sign-extended, so sign-extending it and the
+  // case constants too avoids an unnecessary mask/extension.
+  Instruction::CastOps ExtType = Instruction::ZExt;
+  if (auto *Arg = dyn_cast<Argument>(Cond))
+    if (Arg->hasSExtAttr())
+      ExtType = Instruction::SExt;
+
+  auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
+  ExtInst->insertBefore(SI);
+  SI->setCondition(ExtInst);
+  for (SwitchInst::CaseIt Case : SI->cases()) {
+    APInt NarrowConst = Case.getCaseValue()->getValue();
+    APInt WideConst = (ExtType == Instruction::ZExt) ?
+                      NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
+    Case.setValue(ConstantInt::get(Context, WideConst));
+  }
+
+  return true;
+}
+
 namespace {
 /// \brief Helper class to promote a scalar operation to a vector one.
 /// This class is used to move downward extractelement transition.
@@ -4871,6 +4915,9 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
     return optimizeShuffleVectorInst(SVI);
 
+  if (auto *Switch = dyn_cast<SwitchInst>(I))
+    return optimizeSwitchInst(Switch);
+
   if (isa<ExtractElementInst>(I))
     return optimizeExtractElementInst(I);
 
diff --git a/test/Transforms/CodeGenPrepare/AArch64/widen_switch.ll b/test/Transforms/CodeGenPrepare/AArch64/widen_switch.ll
new file mode 100644
index 00000000000..172541a4608
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/AArch64/widen_switch.ll
@@ -0,0 +1,95 @@
+;; AArch64 is arbitrarily chosen as a 32/64-bit RISC representative to show the transform in all tests.
+
+; RUN: opt < %s -codegenprepare -S -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=ARM64
+
+; AArch64 widens the i16 switch condition and its case constants to 32-bit.
+
+define i32 @widen_switch_i16(i32 %a)  {
+entry:
+  %trunc = trunc i32 %a to i16
+  switch i16 %trunc, label %sw.default [
+    i16 1, label %sw.bb0
+    i16 -1, label %sw.bb1
+  ]
+
+sw.bb0:
+  br label %return
+
+sw.bb1:
+  br label %return
+
+sw.default:
+  br label %return
+
+return:
+  %retval = phi i32 [ -1, %sw.default ], [ 0, %sw.bb0 ], [ 1, %sw.bb1 ]
+  ret i32 %retval
+
+; ARM64-LABEL: @widen_switch_i16(
+; ARM64:       %0 = zext i16 %trunc to i32
+; ARM64-NEXT:  switch i32 %0, label %sw.default [
+; ARM64-NEXT:    i32 1, label %return
+; ARM64-NEXT:    i32 65535, label %sw.bb1
+}
+
+; Widen to 32-bit from a smaller, non-native type.
+
+define i32 @widen_switch_i17(i32 %a)  {
+entry:
+  %trunc = trunc i32 %a to i17
+  switch i17 %trunc, label %sw.default [
+    i17 10, label %sw.bb0
+    i17 -1, label %sw.bb1
+  ]
+
+sw.bb0:
+  br label %return
+
+sw.bb1:
+  br label %return
+
+sw.default:
+  br label %return
+
+return:
+  %retval = phi i32 [ -1, %sw.default ], [ 0, %sw.bb0 ], [ 1, %sw.bb1 ]
+  ret i32 %retval
+
+; ARM64-LABEL: @widen_switch_i17(
+; ARM64:       %0 = zext i17 %trunc to i32
+; ARM64-NEXT:  switch i32 %0, label %sw.default [
+; ARM64-NEXT:    i32 10, label %return
+; ARM64-NEXT:    i32 131071, label %sw.bb1
+}
+
+; If the switch condition is a sign-extended function argument, then the
+; condition and cases should be sign-extended rather than zero-extended
+; because the sign-extension can be optimized away.
+
+define i32 @widen_switch_i16_sext(i2 signext %a)  {
+entry:
+  switch i2 %a, label %sw.default [
+    i2 1, label %sw.bb0
+    i2 -1, label %sw.bb1
+  ]
+
+sw.bb0:
+  br label %return
+
+sw.bb1:
+  br label %return
+
+sw.default:
+  br label %return
+
+return:
+  %retval = phi i32 [ -1, %sw.default ], [ 0, %sw.bb0 ], [ 1, %sw.bb1 ]
+  ret i32 %retval
+
+; ARM64-LABEL: @widen_switch_i16_sext(
+; ARM64:       %0 = sext i2 %a to i32
+; ARM64-NEXT:  switch i32 %0, label %sw.default [
+; ARM64-NEXT:    i32 1, label %return
+; ARM64-NEXT:    i32 -1, label %sw.bb1
+}
+
diff --git a/test/Transforms/CodeGenPrepare/X86/widen_switch.ll b/test/Transforms/CodeGenPrepare/X86/widen_switch.ll
new file mode 100644
index 00000000000..53c9cc07355
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/X86/widen_switch.ll
@@ -0,0 +1,95 @@
+;; x86 is chosen to show the transform when 8-bit and 16-bit registers are available.
+
+; RUN: opt < %s -codegenprepare -S -mtriple=x86_64-unknown-unknown    | FileCheck %s --check-prefix=X86
+
+; No change for x86 because 16-bit registers are part of the architecture.
+
+define i32 @widen_switch_i16(i32 %a)  {
+entry:
+  %trunc = trunc i32 %a to i16
+  switch i16 %trunc, label %sw.default [
+    i16 1, label %sw.bb0
+    i16 -1, label %sw.bb1
+  ]
+
+sw.bb0:
+  br label %return
+
+sw.bb1:
+  br label %return
+
+sw.default:
+  br label %return
+
+return:
+  %retval = phi i32 [ -1, %sw.default ], [ 0, %sw.bb0 ], [ 1, %sw.bb1 ]
+  ret i32 %retval
+
+; X86-LABEL: @widen_switch_i16(
+; X86:       %trunc = trunc i32 %a to i16
+; X86-NEXT:  switch i16 %trunc, label %sw.default [
+; X86-NEXT:    i16 1, label %return
+; X86-NEXT:    i16 -1, label %sw.bb1
+}
+
+; Widen to 32-bit from a smaller, non-native type.
+
+define i32 @widen_switch_i17(i32 %a)  {
+entry:
+  %trunc = trunc i32 %a to i17
+  switch i17 %trunc, label %sw.default [
+    i17 10, label %sw.bb0
+    i17 -1, label %sw.bb1
+  ]
+
+sw.bb0:
+  br label %return
+
+sw.bb1:
+  br label %return
+
+sw.default:
+  br label %return
+
+return:
+  %retval = phi i32 [ -1, %sw.default ], [ 0, %sw.bb0 ], [ 1, %sw.bb1 ]
+  ret i32 %retval
+
+; X86-LABEL: @widen_switch_i17(
+; X86:       %0 = zext i17 %trunc to i32
+; X86-NEXT:  switch i32 %0, label %sw.default [
+; X86-NEXT:    i32 10, label %return
+; X86-NEXT:    i32 131071, label %sw.bb1
+}
+
+; If the switch condition is a sign-extended function argument, then the
+; condition and cases should be sign-extended rather than zero-extended
+; because the sign-extension can be optimized away.
+
+define i32 @widen_switch_i16_sext(i2 signext %a)  {
+entry:
+  switch i2 %a, label %sw.default [
+    i2 1, label %sw.bb0
+    i2 -1, label %sw.bb1
+  ]
+
+sw.bb0:
+  br label %return
+
+sw.bb1:
+  br label %return
+
+sw.default:
+  br label %return
+
+return:
+  %retval = phi i32 [ -1, %sw.default ], [ 0, %sw.bb0 ], [ 1, %sw.bb1 ]
+  ret i32 %retval
+
+; X86-LABEL: @widen_switch_i16_sext(
+; X86:       %0 = sext i2 %a to i8
+; X86-NEXT:  switch i8 %0, label %sw.default [
+; X86-NEXT:    i8 1, label %return
+; X86-NEXT:    i8 -1, label %sw.bb1
+}
+
-- 
2.34.1