From: Yi Jiang
Date: Mon, 21 Apr 2014 19:34:27 +0000 (+0000)
Subject: ARM64: Combine shifts and uses from different basic block to bit-extract instruction
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=5d473a08317ce9f150701c1a3cdb2e3df10922fc;p=oota-llvm.git

ARM64: Combine shifts and uses from different basic block to bit-extract instruction

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@206774 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 2e4956f16d0..c17416dfee9 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -182,6 +182,9 @@ public:
     return HasMultipleConditionRegisters;
   }
 
+  /// Return true if the target has BitExtract instructions.
+  bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
+
   /// Return true if a vector of the given type should be split
   /// (TypeSplitVector) instead of promoted (TypePromoteInteger) during type
   /// legalization.
@@ -1010,6 +1013,14 @@ protected:
     HasMultipleConditionRegisters = hasManyRegs;
   }
 
+  /// Tells the code generator that the target has BitExtract instructions.
+  /// The code generator will aggressively sink "shift"s into the blocks of
+  /// their users if the users will generate "and" instructions which can be
+  /// combined with "shift" to BitExtract instructions.
+  void setHasExtractBitsInsn(bool hasExtractInsn = true) {
+    HasExtractBitsInsn = hasExtractInsn;
+  }
+
   /// Tells the code generator not to expand sequence of operations into a
   /// separate sequences that increases the amount of flow control.
   void setJumpIsExpensive(bool isExpensive = true) {
@@ -1436,6 +1447,12 @@ private:
   /// the blocks of their users.
   bool HasMultipleConditionRegisters;
 
+  /// Tells the code generator that the target has BitExtract instructions.
+  /// The code generator will aggressively sink "shift"s into the blocks of
+  /// their users if the users will generate "and" instructions which can be
+  /// combined with "shift" to BitExtract instructions.
+  bool HasExtractBitsInsn;
+
   /// Tells the code generator not to expand integer divides by constants into a
   /// sequence of muls, adds, and shifts. This is a hack until a real cost
   /// model is in place. If we ever optimize for size, this will be set to true
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 009e153e0e8..27fdd5187bd 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -628,6 +628,187 @@ static bool OptimizeCmpExpression(CmpInst *CI) {
   return MadeChange;
 }
 
+/// isExtractBitsCandidateUse - Check if the candidates could
+/// be combined with shift instruction, which includes:
+/// 1. Truncate instruction
+/// 2. And instruction and the imm is a mask of the low bits:
+///    imm & (imm+1) == 0
+bool isExtractBitsCandidateUse(Instruction *User) {
+  if (!isa<TruncInst>(User)) {
+    if (User->getOpcode() != Instruction::And ||
+        !isa<ConstantInt>(User->getOperand(1)))
+      return false;
+
+    unsigned Cimm = dyn_cast<ConstantInt>(User->getOperand(1))->getZExtValue();
+
+    if (Cimm & (Cimm + 1))
+      return false;
+  }
+  return true;
+}
+
+/// SinkShiftAndTruncate - sink both shift and truncate instructions
+/// to the BB of the truncate's user.
+bool
+SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
+                     DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
+                     const TargetLowering &TLI) {
+  BasicBlock *UserBB = User->getParent();
+  DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
+  TruncInst *TruncI = dyn_cast<TruncInst>(User);
+  bool MadeChange = false;
+
+  for (Value::user_iterator TruncUI = TruncI->user_begin(),
+                            TruncE = TruncI->user_end();
+       TruncUI != TruncE;) {
+
+    Use &TruncTheUse = TruncUI.getUse();
+    Instruction *TruncUser = cast<Instruction>(*TruncUI);
+    // Preincrement use iterator so we don't invalidate it.
+
+    ++TruncUI;
+
+    int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
+    if (!ISDOpcode)
+      continue;
+
+    // If the use is actually a legal node, there will not be an implicit
+    // truncate.
+    if (TLI.isOperationLegalOrCustom(ISDOpcode,
+                                     EVT::getEVT(TruncUser->getType())))
+      continue;
+
+    // Don't bother for PHI nodes.
+    if (isa<PHINode>(TruncUser))
+      continue;
+
+    BasicBlock *TruncUserBB = TruncUser->getParent();
+
+    if (UserBB == TruncUserBB)
+      continue;
+
+    BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
+    CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
+
+    if (!InsertedShift && !InsertedTrunc) {
+      BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
+      // Sink the shift
+      if (ShiftI->getOpcode() == Instruction::AShr)
+        InsertedShift =
+            BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt);
+      else
+        InsertedShift =
+            BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt);
+
+      // Sink the trunc
+      BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
+      TruncInsertPt++;
+
+      InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
+                                       TruncI->getType(), "", TruncInsertPt);
+
+      MadeChange = true;
+
+      TruncTheUse = InsertedTrunc;
+    }
+  }
+  return MadeChange;
+}
+
+/// OptimizeExtractBits - sink the shift *right* instruction into user blocks if
+/// the uses could potentially be combined with this shift instruction and
+/// generate a BitExtract instruction. It will only be applied if the
+/// architecture supports BitExtract instructions. Here is an example:
+/// BB1:
+///   %x.extract.shift = lshr i64 %arg1, 32
+/// BB2:
+///   %x.extract.trunc = trunc i64 %x.extract.shift to i16
+/// ==>
+///
+/// BB2:
+///   %x.extract.shift.1 = lshr i64 %arg1, 32
+///   %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
+///
+/// CodeGen will recognize the pattern in BB2 and generate a BitExtract
+/// instruction.
+/// Return true if any changes are made.
+static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
+                                const TargetLowering &TLI) {
+  BasicBlock *DefBB = ShiftI->getParent();
+
+  /// Only insert instructions in each block once.
+  DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
+
+  bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(ShiftI->getType()));
+
+  bool MadeChange = false;
+  for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
+       UI != E;) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    // Don't bother for PHI nodes.
+    if (isa<PHINode>(User))
+      continue;
+
+    if (!isExtractBitsCandidateUse(User))
+      continue;
+
+    BasicBlock *UserBB = User->getParent();
+
+    if (UserBB == DefBB) {
+      // If the shift and truncate instructions are in the same BB, the use of
+      // the truncate (TruncUse) may still introduce another truncate if not
+      // legal. In this case, we would like to sink both shift and truncate
+      // instructions to the BB of TruncUse.
+      // for example:
+      // BB1:
+      // i64 shift.result = lshr i64 opnd, imm
+      // trunc.result = trunc shift.result to i16
+      //
+      // BB2:
+      // ----> We will have an implicit truncate here if the architecture does
+      // not have i16 compare.
+      // cmp i16 trunc.result, opnd2
+      //
+      if (isa<TruncInst>(User) && shiftIsLegal
+          // If the type of the truncate is legal, no truncate will be
+          // introduced in other basic blocks.
+          && (!TLI.isTypeLegal(TLI.getValueType(User->getType()))))
+        MadeChange =
+            SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI);
+
+      continue;
+    }
+    // If we have already inserted a shift into this block, use it.
+    BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
+
+    if (!InsertedShift) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+
+      if (ShiftI->getOpcode() == Instruction::AShr)
+        InsertedShift =
+            BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt);
+      else
+        InsertedShift =
+            BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt);
+
+      MadeChange = true;
+    }
+
+    // Replace a use of the shift with a use of the new shift.
+    TheUse = InsertedShift;
+  }
+
+  // If we removed all uses, nuke the shift.
+  if (ShiftI->use_empty())
+    ShiftI->eraseFromParent();
+
+  return MadeChange;
+}
+
 namespace {
 class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
 protected:
@@ -3225,6 +3406,17 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
     return false;
   }
 
+  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
+
+  if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
+                BinOp->getOpcode() == Instruction::LShr)) {
+    ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+    if (TLI && CI && TLI->hasExtractBitsInsn())
+      return OptimizeExtractBits(BinOp, CI, *TLI);
+
+    return false;
+  }
+
   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
     if (GEPI->hasAllZeroIndices()) {
       /// The GEP operand must be a pointer, so must its result -> BitCast
diff --git a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
index 8ef13e781b9..f96249937ec 100644
--- a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
+++ b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
@@ -1183,6 +1183,14 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
     // Make sure to clamp the MSB so that we preserve the semantics of the
     // original operations.
     ClampMSB = true;
+  } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
+             isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
+                                   Srl_imm)) {
+    // If the shift result was truncated, we can still combine them.
+    Opd0 = Op0->getOperand(0).getOperand(0);
+
+    // Use the type of SRL node.
+    VT = Opd0->getValueType(0);
   } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
     Opd0 = Op0->getOperand(0);
   } else if (BiggerPattern) {
@@ -1277,8 +1285,19 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
 
   // we're looking for a shift of a shift
   uint64_t Shl_imm = 0;
+  uint64_t Trunc_bits = 0;
   if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
     Opd0 = N->getOperand(0).getOperand(0);
+  } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
+             N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
+    // We are looking for a shift of truncate. Truncate from i64 to i32 could
+    // be considered as setting high 32 bits as zero. Our strategy here is to
+    // always generate 64bit UBFM. This consistency will help the CSE pass
+    // later find more redundancy.
+    Opd0 = N->getOperand(0).getOperand(0);
+    Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
+    VT = Opd0->getValueType(0);
+    assert(VT == MVT::i64 && "the promoted type should be i64");
   } else if (BiggerPattern) {
     // Let's pretend a 0 shift left has been performed.
     // FIXME: Currently we limit this to the bigger pattern case,
@@ -1295,7 +1314,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
   assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
          "bad amount in shift node!");
   // Note: The width operand is encoded as width-1.
-  unsigned Width = VT.getSizeInBits() - Srl_imm - 1;
+  unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
   int sLSB = Srl_imm - Shl_imm;
   if (sLSB < 0)
     return false;
@@ -1354,8 +1373,23 @@ SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
     return NULL;
 
   EVT VT = N->getValueType(0);
-  SDValue Ops[] = { Opd0, CurDAG->getTargetConstant(LSB, VT),
-                    CurDAG->getTargetConstant(MSB, VT) };
+
+  // If the bit extract operation is 64bit but the original type is 32bit, we
+  // need to add one EXTRACT_SUBREG.
+  if ((Opc == ARM64::SBFMXri || Opc == ARM64::UBFMXri) && VT == MVT::i32) {
+    SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64),
+                       CurDAG->getTargetConstant(MSB, MVT::i64)};
+
+    SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64);
+    SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
+    MachineSDNode *Node =
+        CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32,
+                               SDValue(BFM, 0), SubReg);
+    return Node;
+  }
+
+  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT),
+                   CurDAG->getTargetConstant(MSB, VT)};
   return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
 }
 
diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp
index 7fee5646d1c..ee498e65c71 100644
--- a/lib/Target/ARM64/ARM64ISelLowering.cpp
+++ b/lib/Target/ARM64/ARM64ISelLowering.cpp
@@ -438,6 +438,8 @@ ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM)
   setDivIsWellDefined(true);
 
   RequireStrictAlign = StrictAlign;
+
+  setHasExtractBitsInsn(true);
 }
 
 void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
diff --git a/test/CodeGen/ARM64/bitfield-extract.ll b/test/CodeGen/ARM64/bitfield-extract.ll
index 40dee710aa9..4a62cfaf7fc 100644
--- a/test/CodeGen/ARM64/bitfield-extract.ll
+++ b/test/CodeGen/ARM64/bitfield-extract.ll
@@ -1,3 +1,4 @@
+; RUN: opt -codegenprepare -mtriple=arm64-apple=ios -S -o - %s | FileCheck --check-prefix=OPT %s
 ; RUN: llc < %s -march=arm64 | FileCheck %s
 %struct.X = type { i8, i8, [2 x i8] }
 %struct.Y = type { i32, i8 }
@@ -404,3 +405,75 @@ define i64 @fct18(i32 %xor72) nounwind ssp {
   %result = and i64 %conv82, 255
   ret i64 %result
 }
+
+; Using the access to the global array to keep the instruction and control flow.
+@first_ones = external global [65536 x i8]
+
+; Function Attrs: nounwind readonly ssp
+define i32 @fct19(i64 %arg1) nounwind readonly ssp {
+; CHECK-LABEL: fct19:
+entry:
+  %x.sroa.1.0.extract.shift = lshr i64 %arg1, 16
+  %x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16
+  %x.sroa.3.0.extract.shift = lshr i64 %arg1, 32
+  %x.sroa.5.0.extract.shift = lshr i64 %arg1, 48
+  %tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %x.sroa.5.0.extract.shift
+  %0 = load i8* %arrayidx3, align 1
+  %conv = zext i8 %0 to i32
+  br label %return
+
+; OPT-LABEL: if.end
+if.end:                                           ; preds = %entry
+; OPT: lshr
+; CHECK: ubfm [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #47
+  %x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16
+  %tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0
+; CHECK: cbz
+  br i1 %tobool6, label %if.end13, label %if.then7
+
+; OPT-LABEL: if.then7
+if.then7:                                         ; preds = %if.end
+; OPT: lshr
+; "and" should be combined to "ubfm" while "ubfm" should be removed by CSE.
+; So neither of them should be in the assembly code.
+; CHECK-NOT: and
+; CHECK-NOT: ubfm
+  %idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535
+  %arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom10
+  %1 = load i8* %arrayidx11, align 1
+  %conv12 = zext i8 %1 to i32
+  %add = add nsw i32 %conv12, 16
+  br label %return
+
+; OPT-LABEL: if.end13
+if.end13:                                         ; preds = %if.end
+; OPT: lshr
+; OPT: trunc
+; CHECK: ubfm [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #31
+  %tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0
+; CHECK: cbz
+  br i1 %tobool16, label %return, label %if.then17
+
+; OPT-LABEL: if.then17
+if.then17:                                        ; preds = %if.end13
+; OPT: lshr
+; "and" should be combined to "ubfm" while "ubfm" should be removed by CSE.
+; So neither of them should be in the assembly code.
+; CHECK-NOT: and
+; CHECK-NOT: ubfm
+  %idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535
+  %arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom20
+  %2 = load i8* %arrayidx21, align 1
+  %conv22 = zext i8 %2 to i32
+  %add23 = add nsw i32 %conv22, 32
+  br label %return
+
+return:                                           ; preds = %if.end13, %if.then17, %if.then7, %if.then
+; CHECK: ret
+  %retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23, %if.then17 ], [ 64, %if.end13 ]
+  ret i32 %retval.0
+}
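
The effect of the CodeGenPrepare change is easiest to see on a small example. The IR below is only an illustrative sketch (the function, block, and value names are invented and do not come from the patch or its tests): once a target reports hasExtractBitsInsn(), a right shift by a constant is duplicated into the block of any user that masks the low bits, so instruction selection sees the lshr and the and together in one block and can fold them into a single bitfield extract (ubfm/ubfx on ARM64).

  ; Input to codegenprepare: the shift and the low-bit mask live in
  ; different basic blocks, so ISel alone could not combine them.
  define i64 @lshr_and_sketch(i64 %arg) {        ; hypothetical function
  entry:
    %shift = lshr i64 %arg, 32
    br label %use

  use:
    %masked = and i64 %shift, 65535              ; 0xffff, a low-bit mask
    ret i64 %masked
  }

  ; After codegenprepare (sketch of the expected output):
  ;   use:
  ;     %shift.sunk = lshr i64 %arg, 32
  ;     %masked = and i64 %shift.sunk, 65535
  ;     ret i64 %masked
  ; With the pair in one block, the ARM64 selector can emit a single
  ; bitfield extract instead of a shift followed by an and.

If every interesting use is rewritten this way, the original shift becomes dead and is erased, which is what the use_empty()/eraseFromParent() step in OptimizeExtractBits does.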
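
The SinkShiftAndTruncate path covers a second pattern: the shift and a truncate start out in the same block, but the truncate's user sits in another block and uses a type that is not legal on the target (for example an i16 compare on ARM64). Another minimal, hypothetical sketch with invented names:

  ; Input: %s and %t are in %entry, the i16 compare is in %bb2.
  define i32 @sink_trunc_sketch(i64 %arg) {      ; hypothetical test case
  entry:
    %s = lshr i64 %arg, 16
    %t = trunc i64 %s to i16
    br label %bb2

  bb2:
    %c = icmp eq i16 %t, 0                       ; i16 is not a legal ARM64 type
    %r = zext i1 %c to i32
    ret i32 %r
  }

  ; After codegenprepare (sketch): both the shift and the trunc are
  ; re-created at the top of %bb2, next to the compare that would otherwise
  ; force an implicit truncate there:
  ;   bb2:
  ;     %s.sunk = lshr i64 %arg, 16
  ;     %t.sunk = trunc i64 %s.sunk to i16
  ;     %c = icmp eq i16 %t.sunk, 0
  ;     ...

Keeping the shift, the truncate, and its user in one block is what lets the new ARM64 ISel code select a 64-bit UBFM (with an EXTRACT_SUBREG when an i32 result is needed) for these patterns, as exercised by the fct19 test above.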