From f5575ecd57c4ab8cdae1a80fecc01029d14fe4e6 Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@hanshq.net>
Date: Tue, 19 Jan 2016 20:49:25 +0000
Subject: [PATCH] Merging r257875:
 ------------------------------------------------------------------------
 r257875 | jamesm | 2016-01-15 01:20:19 -0800 (Fri, 15 Jan 2016) | 11 lines

[InstCombine] Rewrite bswap/bitreverse handling completely.

There are several requirements that ended up with this design;
  1. Matching bitreversals is too heavyweight for InstCombine and doesn't really need to be done so early.
  2. Bitreversals and byteswaps are very related in their matching logic.
  3. We want to implement support for matching more advanced bswap/bitreverse patterns like partial bswaps/bitreverses.
  4. Bswaps are best matched early in InstCombine.

The result of these is that a new utility function is created in Transforms/Utils/Local.h that can be configured to search for bswaps, bitreverses or both. InstCombine uses it to find only bswaps, CGP uses it to find only bitreversals.

We can then extend the matching logic in one place only.
------------------------------------------------------------------------


git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258180 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/Local.h         |  19 ++
 lib/CodeGen/CodeGenPrepare.cpp                |  29 +++
 .../InstCombine/InstCombineAndOrXor.cpp       | 187 +---------------
 lib/Transforms/Utils/Local.cpp                | 202 ++++++++++++++++++
 .../ARM/bitreverse-recognize.ll               |  37 ++++
 .../CodeGenPrepare/ARM/lit.local.cfg          |   3 +
 .../CodeGenPrepare/bitreverse-hang.ll         |  53 +++++
 .../Transforms/InstCombine/bitreverse-hang.ll |  53 +++++
 .../InstCombine/bitreverse-recognize.ll       | 114 ----------
 9 files changed, 404 insertions(+), 293 deletions(-)
 create mode 100644 test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
 create mode 100644 test/Transforms/CodeGenPrepare/ARM/lit.local.cfg
 create mode 100644 test/Transforms/CodeGenPrepare/bitreverse-hang.ll
 create mode 100644 test/Transforms/InstCombine/bitreverse-hang.ll
 delete mode 100644 test/Transforms/InstCombine/bitreverse-recognize.ll
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index 911c6f14da0..3ae01657a2e 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -331,6 +331,25 @@ unsigned replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT,
 /// during lowering by the GC infrastructure.
 bool callsGCLeafFunction(ImmutableCallSite CS);
 
+//===----------------------------------------------------------------------===//
+//  Intrinsic pattern matching
+//
+
+/// Try and match a bitreverse or bswap idiom.
+///
+/// If an idiom is matched, an intrinsic call is inserted before \c I. Any added
+/// instructions are returned in \c InsertedInsts. They will all have been added
+/// to a basic block.
+///
+/// A bitreverse idiom normally requires around 2*BW nodes to be searched (where
+/// BW is the bitwidth of the integer type). A bswap idiom requires anywhere up
+/// to BW / 4 nodes to be searched, so is significantly faster.
+///
+/// This function returns true on a successful match or false otherwise.
+bool recognizeBitReverseOrBSwapIdiom(
+    Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
+    SmallVectorImpl<Instruction *> &InsertedInsts);
+
 } // End llvm namespace
 
 #endif
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index fd4ee464337..a7cfd580c16 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -5211,6 +5211,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   return false;
 }
 
+/// Given an OR instruction, check to see if this is a bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
+static bool makeBitReverse(Instruction &I, const DataLayout &DL,
+                           const TargetLowering &TLI) {
+  if (!I.getType()->isIntegerTy() ||
+      !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
+                                    TLI.getValueType(DL, I.getType(), true)))
+    return false;
+
+  SmallVector<Instruction*, 4> Insts;
+  if (!recognizeBitReverseOrBSwapIdiom(&I, false, true, Insts))
+    return false;
+  Instruction *LastInst = Insts.back();
+  I.replaceAllUsesWith(LastInst);
+  RecursivelyDeleteTriviallyDeadInstructions(&I);
+  return true;
+}
+
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
@@ -5226,6 +5244,17 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
   }
   MadeChange |= dupRetToEnableTailCallOpts(&BB);
 
+  bool MadeBitReverse = true;
+  while (TLI && MadeBitReverse) {
+    MadeBitReverse = false;
+    for (auto &I : reverse(BB)) {
+      if (makeBitReverse(I, *DL, *TLI)) {
+        MadeBitReverse = MadeChange = true;
+        break;
+      }
+    }
+  }
+  
   return MadeChange;
 }
 
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 95c50d32c82..76cefd97cd8 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -17,6 +17,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Transforms/Utils/CmpInstAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 using namespace PatternMatch;
 
@@ -1565,190 +1566,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
   return Changed ? &I : nullptr;
 }
 
-
-/// Analyze the specified subexpression and see if it is capable of providing
-/// pieces of a bswap or bitreverse. The subexpression provides a potential
-/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in
-/// the output of the expression came from a corresponding bit in some other
-/// value. This function is recursive, and the end result is a mapping of
-/// (value, bitnumber) to bitnumber. It is the caller's responsibility to
-/// validate that all `value`s are identical and that the bitnumber to bitnumber
-/// mapping is correct for a bswap or bitreverse.
-///
-/// For example, if the current subexpression if "(shl i32 %X, 24)" then we know
-/// that the expression deposits the low byte of %X into the high byte of the
-/// result and that all other bits are zero. This expression is accepted,
-/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7].
-///
-/// This function returns true if the match was unsuccessful and false if so.
-/// On entry to the function the "OverallLeftShift" is a signed integer value
-/// indicating the number of bits that the subexpression is later shifted.  For
-/// example, if the expression is later right shifted by 16 bits, the
-/// OverallLeftShift value would be -16 on entry.  This is used to specify which
-/// bits of BitValues are actually being set.
-///
-/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding
-/// bit is masked to zero by a user.  For example, in (X & 255), X will be
-/// processed with a bytemask of 255. BitMask is always in the local
-/// (OverallLeftShift) coordinate space.
-///
-static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask,
-                            SmallVectorImpl<Value *> &BitValues,
-                            SmallVectorImpl<int> &BitProvenance) {
-  if (Instruction *I = dyn_cast<Instruction>(V)) {
-    // If this is an or instruction, it may be an inner node of the bswap.
-    if (I->getOpcode() == Instruction::Or)
-      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance) ||
-             CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance);
-
-    // If this is a logical shift by a constant, recurse with OverallLeftShift
-    // and BitMask adjusted.
-    if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
-      unsigned ShAmt =
-          cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
-      // Ensure the shift amount is defined.
-      if (ShAmt > BitValues.size())
-        return true;
-
-      unsigned BitShift = ShAmt;
-      if (I->getOpcode() == Instruction::Shl) {
-        // X << C -> collect(X, +C)
-        OverallLeftShift += BitShift;
-        BitMask = BitMask.lshr(BitShift);
-      } else {
-        // X >>u C -> collect(X, -C)
-        OverallLeftShift -= BitShift;
-        BitMask = BitMask.shl(BitShift);
-      }
-
-      if (OverallLeftShift >= (int)BitValues.size())
-        return true;
-      if (OverallLeftShift <= -(int)BitValues.size())
-        return true;
-
-      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance);
-    }
-
-    // If this is a logical 'and' with a mask that clears bits, clear the
-    // corresponding bits in BitMask.
-    if (I->getOpcode() == Instruction::And &&
-        isa<ConstantInt>(I->getOperand(1))) {
-      unsigned NumBits = BitValues.size();
-      APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1);
-      const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
-
-      for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) {
-        // If this bit is masked out by a later operation, we don't care what
-        // the and mask is.
-        if (BitMask[i] == 0)
-          continue;
-
-        // If the AndMask is zero for this bit, clear the bit.
-        APInt MaskB = AndMask & Bit;
-        if (MaskB == 0) {
-          BitMask.clearBit(i);
-          continue;
-        }
-
-        // Otherwise, this bit is kept.
-      }
-
-      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance);
-    }
-  }
-
-  // Okay, we got to something that isn't a shift, 'or' or 'and'.  This must be
-  // the input value to the bswap/bitreverse. To be part of a bswap or
-  // bitreverse we must be demanding a contiguous range of bits from it.
-  unsigned InputBitLen = BitMask.countPopulation();
-  unsigned InputBitNo = BitMask.countTrailingZeros();
-  if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo !=
-      InputBitLen)
-    // Not a contiguous set range of bits!
-    return true;
-
-  // We know we're moving a contiguous range of bits from the input to the
-  // output. Record which bits in the output came from which bits in the input.
-  unsigned DestBitNo = InputBitNo + OverallLeftShift;
-  for (unsigned I = 0; I < InputBitLen; ++I)
-    BitProvenance[DestBitNo + I] = InputBitNo + I;
-
-  // If the destination bit value is already defined, the values are or'd
-  // together, which isn't a bswap/bitreverse (unless it's an or of the same
-  // bits).
-  if (BitValues[DestBitNo] && BitValues[DestBitNo] != V)
-    return true;
-  for (unsigned I = 0; I < InputBitLen; ++I)
-    BitValues[DestBitNo + I] = V;
-
-  return false;
-}
-
-static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
-                                          unsigned BitWidth) {
-  if (From % 8 != To % 8)
-    return false;
-  // Convert from bit indices to byte indices and check for a byte reversal.
-  From >>= 3;
-  To >>= 3;
-  BitWidth >>= 3;
-  return From == BitWidth - To - 1;
-}
-
-static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
-                                               unsigned BitWidth) {
-  return From == BitWidth - To - 1;
-}
-
 /// Given an OR instruction, check to see if this is a bswap or bitreverse
 /// idiom. If so, insert the new intrinsic and return it.
 Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) {
-  IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
-  if (!ITy)
-    return nullptr;   // Can't do vectors.
-  unsigned BW = ITy->getBitWidth();
-  
-  /// We keep track of which bit (BitProvenance) inside which value (BitValues)
-  /// defines each bit in the result.
-  SmallVector<Value *, 8> BitValues(BW, nullptr);
-  SmallVector<int, 8> BitProvenance(BW, -1);
-  
-  // Try to find all the pieces corresponding to the bswap.
-  APInt BitMask = APInt::getAllOnesValue(BitValues.size());
-  if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance))
-    return nullptr;
-
-  // Check to see if all of the bits come from the same value.
-  Value *V = BitValues[0];
-  if (!V) return nullptr;  // Didn't find a bit?  Must be zero.
-
-  if (!std::all_of(BitValues.begin(), BitValues.end(),
-                   [&](const Value *X) { return X == V; }))
-    return nullptr;
-
-  // Now, is the bit permutation correct for a bswap or a bitreverse? We can
-  // only byteswap values with an even number of bytes.
-  bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;;
-  for (unsigned i = 0, e = BitValues.size(); i != e; ++i) {
-    OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW);
-    OKForBitReverse &=
-        bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW);
-  }
-
-  Intrinsic::ID Intrin;
-  if (OKForBSwap)
-    Intrin = Intrinsic::bswap;
-  else if (OKForBitReverse)
-    Intrin = Intrinsic::bitreverse;
-  else
+  SmallVector<Instruction*, 4> Insts;
+  if (!recognizeBitReverseOrBSwapIdiom(&I, true, false, Insts))
     return nullptr;
+  Instruction *LastInst = Insts.pop_back_val();
+  LastInst->removeFromParent();
 
-  Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy);
-  return CallInst::Create(F, V);
+  for (auto *Inst : Insts)
+    Worklist.Add(Inst);
+  return LastInst;
 }
 
 /// We have an expression of the form (A&C)|(B&D).  Check if A is (cond?-1:0)
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 1cb65fccb9d..abc9b65f7a3 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1592,3 +1592,205 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
 
   return false;
 }
+
+/// A potential constituent of a bitreverse or bswap expression. See
+/// collectBitParts for a fuller explanation.
+struct BitPart {
+  BitPart(Value *P, unsigned BW) : Provider(P) {
+    Provenance.resize(BW);
+  }
+
+  /// The Value that this is a bitreverse/bswap of.
+  Value *Provider;
+  /// The "provenance" of each bit. Provenance[A] = B means that bit A
+  /// in Provider becomes bit B in the result of this expression.
+  SmallVector<int8_t, 32> Provenance; // int8_t means max size is i128.
+
+  enum { Unset = -1 };
+};
+
+/// Analyze the specified subexpression and see if it is capable of providing
+/// pieces of a bswap or bitreverse. The subexpression provides a potential
+/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in
+/// the output of the expression came from a corresponding bit in some other
+/// value. This function is recursive, and the end result is a mapping of
+/// bitnumber to bitnumber. It is the caller's responsibility to validate that
+/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse.
+///
+/// For example, if the current subexpression if "(shl i32 %X, 24)" then we know
+/// that the expression deposits the low byte of %X into the high byte of the
+/// result and that all other bits are zero. This expression is accepted and a
+/// BitPart is returned with Provider set to %X and Provenance[24-31] set to
+/// [0-7].
+///
+/// To avoid revisiting values, the BitPart results are memoized into the
+/// provided map. To avoid unnecessary copying of BitParts, BitParts are
+/// constructed in-place in the \c BPS map. Because of this \c BPS needs to
+/// store BitParts objects, not pointers. As we need the concept of a nullptr
+/// BitParts (Value has been analyzed and the analysis failed), we an Optional
+/// type instead to provide the same functionality.
+///
+/// Because we pass around references into \c BPS, we must use a container that
+/// does not invalidate internal references (std::map instead of DenseMap).
+///
+static const Optional<BitPart> &
+collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
+                std::map<Value *, Optional<BitPart>> &BPS) {
+  auto I = BPS.find(V);
+  if (I != BPS.end())
+    return I->second;
+
+  auto &Result = BPS[V] = None;
+  auto BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    // If this is an or instruction, it may be an inner node of the bswap.
+    if (I->getOpcode() == Instruction::Or) {
+      auto &A = collectBitParts(I->getOperand(0), MatchBSwaps,
+                                MatchBitReversals, BPS);
+      auto &B = collectBitParts(I->getOperand(1), MatchBSwaps,
+                                MatchBitReversals, BPS);
+      if (!A || !B)
+        return Result;
+
+      // Try and merge the two together.
+      if (!A->Provider || A->Provider != B->Provider)
+        return Result;
+
+      Result = BitPart(A->Provider, BitWidth);
+      for (unsigned i = 0; i < A->Provenance.size(); ++i) {
+        if (A->Provenance[i] != BitPart::Unset &&
+            B->Provenance[i] != BitPart::Unset &&
+            A->Provenance[i] != B->Provenance[i])
+          return Result = None;
+
+        if (A->Provenance[i] == BitPart::Unset)
+          Result->Provenance[i] = B->Provenance[i];
+        else
+          Result->Provenance[i] = A->Provenance[i];
+      }
+
+      return Result;
+    }
+
+    // If this is a logical shift by a constant, recurse then shift the result.
+    if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
+      unsigned BitShift =
+          cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
+      // Ensure the shift amount is defined.
+      if (BitShift > BitWidth)
+        return Result;
+
+      auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
+                                  MatchBitReversals, BPS);
+      if (!Res)
+        return Result;
+      Result = Res;
+
+      // Perform the "shift" on BitProvenance.
+      auto &P = Result->Provenance;
+      if (I->getOpcode() == Instruction::Shl) {
+        P.erase(std::prev(P.end(), BitShift), P.end());
+        P.insert(P.begin(), BitShift, BitPart::Unset);
+      } else {
+        P.erase(P.begin(), std::next(P.begin(), BitShift));
+        P.insert(P.end(), BitShift, BitPart::Unset);
+      }
+
+      return Result;
+    }
+
+    // If this is a logical 'and' with a mask that clears bits, recurse then
+    // unset the appropriate bits.
+    if (I->getOpcode() == Instruction::And &&
+        isa<ConstantInt>(I->getOperand(1))) {
+      APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1);
+      const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
+
+      // Check that the mask allows a multiple of 8 bits for a bswap, for an
+      // early exit.
+      unsigned NumMaskedBits = AndMask.countPopulation();
+      if (!MatchBitReversals && NumMaskedBits % 8 != 0)
+        return Result;
+      
+      auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
+                                  MatchBitReversals, BPS);
+      if (!Res)
+        return Result;
+      Result = Res;
+
+      for (unsigned i = 0; i < BitWidth; ++i, Bit <<= 1)
+        // If the AndMask is zero for this bit, clear the bit.
+        if ((AndMask & Bit) == 0)
+          Result->Provenance[i] = BitPart::Unset;
+
+      return Result;
+    }
+  }
+
+  // Okay, we got to something that isn't a shift, 'or' or 'and'.  This must be
+  // the input value to the bswap/bitreverse.
+  Result = BitPart(V, BitWidth);
+  for (unsigned i = 0; i < BitWidth; ++i)
+    Result->Provenance[i] = i;
+  return Result;
+}
+
+static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
+                                          unsigned BitWidth) {
+  if (From % 8 != To % 8)
+    return false;
+  // Convert from bit indices to byte indices and check for a byte reversal.
+  From >>= 3;
+  To >>= 3;
+  BitWidth >>= 3;
+  return From == BitWidth - To - 1;
+}
+
+static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
+                                               unsigned BitWidth) {
+  return From == BitWidth - To - 1;
+}
+
+/// Given an OR instruction, check to see if this is a bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
+bool llvm::recognizeBitReverseOrBSwapIdiom(
+    Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
+    SmallVectorImpl<Instruction *> &InsertedInsts) {
+  if (Operator::getOpcode(I) != Instruction::Or)
+    return false;
+  if (!MatchBSwaps && !MatchBitReversals)
+    return false;
+  IntegerType *ITy = dyn_cast<IntegerType>(I->getType());
+  if (!ITy || ITy->getBitWidth() > 128)
+    return false;   // Can't do vectors or integers > 128 bits.
+  unsigned BW = ITy->getBitWidth();
+
+  // Try to find all the pieces corresponding to the bswap.
+  std::map<Value *, Optional<BitPart>> BPS;
+  auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS);
+  if (!Res)
+    return false;
+  auto &BitProvenance = Res->Provenance;
+
+  // Now, is the bit permutation correct for a bswap or a bitreverse? We can
+  // only byteswap values with an even number of bytes.
+  bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;
+  for (unsigned i = 0; i < BW; ++i) {
+    OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW);
+    OKForBitReverse &=
+        bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW);
+  }
+
+  Intrinsic::ID Intrin;
+  if (OKForBSwap && MatchBSwaps)
+    Intrin = Intrinsic::bswap;
+  else if (OKForBitReverse && MatchBitReversals)
+    Intrin = Intrinsic::bitreverse;
+  else
+    return false;
+
+  Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, ITy);
+  InsertedInsts.push_back(CallInst::Create(F, Res->Provider, "rev", I));
+  return true;
+}
diff --git a/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll b/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
new file mode 100644
index 00000000000..36440da2162
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -loop-unroll -codegenprepare < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+; CHECK-LABEL: @f
+define i32 @f(i32 %a) #0 {
+; CHECK: call i32 @llvm.bitreverse.i32
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %or
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %b.07 = phi i32 [ 0, %entry ], [ %or, %for.body ]
+  %shr = lshr i32 %a, %i.08
+  %and = and i32 %shr, 1
+  %sub = sub nuw nsw i32 31, %i.08
+  %shl = shl i32 %and, %sub
+  %or = or i32 %shl, %b.07
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 32
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3
+}
+
+attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git b7441a0f42c43a8eea9e3e706be187252db747fa)"}
+!3 = distinct !{!3, !4}
+!4 = !{!"llvm.loop.unroll.full"}
diff --git a/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg b/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg
new file mode 100644
index 00000000000..98c6700c209
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'ARM' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/CodeGenPrepare/bitreverse-hang.ll b/test/Transforms/CodeGenPrepare/bitreverse-hang.ll
new file mode 100644
index 00000000000..c81dcc15cae
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/bitreverse-hang.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -loop-unroll -codegenprepare -S | FileCheck %s
+
+; This test is a worst-case scenario for bitreversal/byteswap detection.
+; After loop unrolling (the unrolled loop is unreadably large so it has been kept
+; rolled here), we have a binary tree of OR operands (as bitreversal detection
+; looks straight through shifts):
+;
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;
+; This results in exponential runtime. The loop here is 32 iterations which will
+; totally hang if we don't deal with this case cleverly.
+
+@b = common global i32 0, align 4
+
+; CHECK: define i32 @fn1
+define i32 @fn1() #0 {
+entry:
+  %b.promoted = load i32, i32* @b, align 4, !tbaa !2
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %or4 = phi i32 [ %b.promoted, %entry ], [ %or, %for.body ]
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %shr = lshr i32 %or4, 1
+  %or = or i32 %shr, %or4
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 32
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  store i32 %or, i32* @b, align 4, !tbaa !2
+  ret i32 undef
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git eb70f4e9cc9a4dc3dd57b032fb858d56b4b64a0e)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/InstCombine/bitreverse-hang.ll b/test/Transforms/InstCombine/bitreverse-hang.ll
new file mode 100644
index 00000000000..6823bd0ed65
--- /dev/null
+++ b/test/Transforms/InstCombine/bitreverse-hang.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -loop-unroll -instcombine -S | FileCheck %s
+
+; This test is a worst-case scenario for bitreversal/byteswap detection.
+; After loop unrolling (the unrolled loop is unreadably large so it has been kept
+; rolled here), we have a binary tree of OR operands (as bitreversal detection
+; looks straight through shifts):
+;
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;
+; This results in exponential runtime. The loop here is 32 iterations which will
+; totally hang if we don't deal with this case cleverly.
+
+@b = common global i32 0, align 4
+
+; CHECK: define i32 @fn1
+define i32 @fn1() #0 {
+entry:
+  %b.promoted = load i32, i32* @b, align 4, !tbaa !2
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %or4 = phi i32 [ %b.promoted, %entry ], [ %or, %for.body ]
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %shr = lshr i32 %or4, 1
+  %or = or i32 %shr, %or4
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 32
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  store i32 %or, i32* @b, align 4, !tbaa !2
+  ret i32 undef
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git eb70f4e9cc9a4dc3dd57b032fb858d56b4b64a0e)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/InstCombine/bitreverse-recognize.ll b/test/Transforms/InstCombine/bitreverse-recognize.ll
deleted file mode 100644
index fbd5cb6d139..00000000000
--- a/test/Transforms/InstCombine/bitreverse-recognize.ll
+++ /dev/null
@@ -1,114 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.10.0"
-
-define zeroext i8 @f_u8(i8 zeroext %a) {
-; CHECK-LABEL: @f_u8
-; CHECK-NEXT: %[[A:.*]] = call i8 @llvm.bitreverse.i8(i8 %a)
-; CHECK-NEXT: ret i8 %[[A]]
-  %1 = shl i8 %a, 7
-  %2 = shl i8 %a, 5
-  %3 = and i8 %2, 64
-  %4 = shl i8 %a, 3
-  %5 = and i8 %4, 32
-  %6 = shl i8 %a, 1
-  %7 = and i8 %6, 16
-  %8 = lshr i8 %a, 1
-  %9 = and i8 %8, 8
-  %10 = lshr i8 %a, 3
-  %11 = and i8 %10, 4
-  %12 = lshr i8 %a, 5
-  %13 = and i8 %12, 2
-  %14 = lshr i8 %a, 7
-  %15 = or i8 %14, %1
-  %16 = or i8 %15, %3
-  %17 = or i8 %16, %5
-  %18 = or i8 %17, %7
-  %19 = or i8 %18, %9
-  %20 = or i8 %19, %11
-  %21 = or i8 %20, %13
-  ret i8 %21
-}
-
-; The ANDs with 32 and 64 have been swapped here, so the sequence does not
-; completely match a bitreverse.
-define zeroext i8 @f_u8_fail(i8 zeroext %a) {
-; CHECK-LABEL: @f_u8_fail
-; CHECK-NOT: call
-; CHECK: ret i8
-  %1 = shl i8 %a, 7
-  %2 = shl i8 %a, 5
-  %3 = and i8 %2, 32
-  %4 = shl i8 %a, 3
-  %5 = and i8 %4, 64
-  %6 = shl i8 %a, 1
-  %7 = and i8 %6, 16
-  %8 = lshr i8 %a, 1
-  %9 = and i8 %8, 8
-  %10 = lshr i8 %a, 3
-  %11 = and i8 %10, 4
-  %12 = lshr i8 %a, 5
-  %13 = and i8 %12, 2
-  %14 = lshr i8 %a, 7
-  %15 = or i8 %14, %1
-  %16 = or i8 %15, %3
-  %17 = or i8 %16, %5
-  %18 = or i8 %17, %7
-  %19 = or i8 %18, %9
-  %20 = or i8 %19, %11
-  %21 = or i8 %20, %13
-  ret i8 %21
-}
-
-define zeroext i16 @f_u16(i16 zeroext %a) {
-; CHECK-LABEL: @f_u16
-; CHECK-NEXT: %[[A:.*]] = call i16 @llvm.bitreverse.i16(i16 %a)
-; CHECK-NEXT: ret i16 %[[A]]
-  %1 = shl i16 %a, 15
-  %2 = shl i16 %a, 13
-  %3 = and i16 %2, 16384
-  %4 = shl i16 %a, 11
-  %5 = and i16 %4, 8192
-  %6 = shl i16 %a, 9
-  %7 = and i16 %6, 4096
-  %8 = shl i16 %a, 7
-  %9 = and i16 %8, 2048
-  %10 = shl i16 %a, 5
-  %11 = and i16 %10, 1024
-  %12 = shl i16 %a, 3
-  %13 = and i16 %12, 512
-  %14 = shl i16 %a, 1
-  %15 = and i16 %14, 256
-  %16 = lshr i16 %a, 1
-  %17 = and i16 %16, 128
-  %18 = lshr i16 %a, 3
-  %19 = and i16 %18, 64
-  %20 = lshr i16 %a, 5
-  %21 = and i16 %20, 32
-  %22 = lshr i16 %a, 7
-  %23 = and i16 %22, 16
-  %24 = lshr i16 %a, 9
-  %25 = and i16 %24, 8
-  %26 = lshr i16 %a, 11
-  %27 = and i16 %26, 4
-  %28 = lshr i16 %a, 13
-  %29 = and i16 %28, 2
-  %30 = lshr i16 %a, 15
-  %31 = or i16 %30, %1
-  %32 = or i16 %31, %3
-  %33 = or i16 %32, %5
-  %34 = or i16 %33, %7
-  %35 = or i16 %34, %9
-  %36 = or i16 %35, %11
-  %37 = or i16 %36, %13
-  %38 = or i16 %37, %15
-  %39 = or i16 %38, %17
-  %40 = or i16 %39, %19
-  %41 = or i16 %40, %21
-  %42 = or i16 %41, %23
-  %43 = or i16 %42, %25
-  %44 = or i16 %43, %27
-  %45 = or i16 %44, %29
-  ret i16 %45
-}
\ No newline at end of file
-- 
2.34.1