cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
+cl::opt<bool> UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true),
+ cl::desc("use aggressive ppc isel for bit permutations"), cl::Hidden);
+cl::opt<bool> BPermRewriterNoMasking("ppc-bit-perm-rewriter-stress-rotates",
+ cl::desc("stress rotate selection in aggressive ppc isel for "
+ "bit permutations"), cl::Hidden);
+
namespace llvm {
void initializePPCDAGToDAGISelPass(PassRegistry&);
}
return nullptr;
}
+// Predict the number of instructions that would be generated by calling
+// SelectInt64 (below) on the same immediate.
+static unsigned SelectInt64Count(int64_t Imm) {
+ // Assume no remaining bits.
+ unsigned Remainder = 0;
+ // Assume no shift required.
+ unsigned Shift = 0;
+
+ // If it can't be represented as a 32 bit value.
+ if (!isInt<32>(Imm)) {
+ Shift = countTrailingZeros<uint64_t>(Imm);
+ int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+ // If the shifted value fits 32 bits.
+ if (isInt<32>(ImmSh)) {
+ // Go with the shifted value.
+ Imm = ImmSh;
+ } else {
+ // Still stuck with a 64 bit value.
+ Remainder = Imm;
+ Shift = 32;
+ Imm >>= 32;
+ }
+ }
+
+  // Running count of instructions required.
+  unsigned Result = 0;
+
+ // Handle first 32 bits.
+ unsigned Lo = Imm & 0xFFFF;
+ unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+ // Simple value.
+ if (isInt<16>(Imm)) {
+ // Just the Lo bits.
+ ++Result;
+ } else if (Lo) {
+ // Handle the Hi bits and Lo bits.
+ Result += 2;
+ } else {
+ // Just the Hi bits.
+ ++Result;
+ }
+
+ // If no shift, we're done.
+ if (!Shift) return Result;
+
+ // Shift for next step if the upper 32-bits were not zero.
+ if (Imm)
+ ++Result;
+
+ // Add in the last bits as required.
+ if ((Hi = (Remainder >> 16) & 0xFFFF))
+ ++Result;
+ if ((Lo = Remainder & 0xFFFF))
+ ++Result;
+
+ return Result;
+}
+
+// Select a 64-bit constant. For cost-modeling purposes, SelectInt64Count
+// (above) needs to be kept in sync with this function.
+static SDNode *SelectInt64(SelectionDAG *CurDAG, SDLoc dl, int64_t Imm) {
+ // Assume no remaining bits.
+ unsigned Remainder = 0;
+ // Assume no shift required.
+ unsigned Shift = 0;
+
+ // If it can't be represented as a 32 bit value.
+ if (!isInt<32>(Imm)) {
+ Shift = countTrailingZeros<uint64_t>(Imm);
+ int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+ // If the shifted value fits 32 bits.
+ if (isInt<32>(ImmSh)) {
+ // Go with the shifted value.
+ Imm = ImmSh;
+ } else {
+ // Still stuck with a 64 bit value.
+ Remainder = Imm;
+ Shift = 32;
+ Imm >>= 32;
+ }
+ }
+
+ // Intermediate operand.
+ SDNode *Result;
+
+ // Handle first 32 bits.
+ unsigned Lo = Imm & 0xFFFF;
+ unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+ auto getI32Imm = [CurDAG](unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ };
+
+ // Simple value.
+ if (isInt<16>(Imm)) {
+ // Just the Lo bits.
+ Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
+ } else if (Lo) {
+ // Handle the Hi bits.
+ unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
+ Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
+ // And Lo bits.
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Lo));
+ } else {
+ // Just the Hi bits.
+ Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
+ }
+
+ // If no shift, we're done.
+ if (!Shift) return Result;
+
+ // Shift for next step if the upper 32-bits were not zero.
+ if (Imm) {
+ Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+ SDValue(Result, 0),
+ getI32Imm(Shift),
+ getI32Imm(63 - Shift));
+ }
+
+ // Add in the last bits as required.
+ if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+ Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Hi));
+ }
+ if ((Lo = Remainder & 0xFFFF)) {
+ Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+ SDValue(Result, 0), getI32Imm(Lo));
+ }
+
+ return Result;
+}
+
+// Select a 64-bit constant.
+static SDNode *SelectInt64(SelectionDAG *CurDAG, SDNode *N) {
+ SDLoc dl(N);
+
+ // Get 64 bit value.
+ int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
+ return SelectInt64(CurDAG, dl, Imm);
+}
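
A worked trace of the two routines above (an illustrative sketch; the constant is arbitrary):

//   Imm = 0x0123456789AB0000 does not fit in 32 bits, and neither does the
//   value shifted right by its 16 trailing zeros, so the second branch is
//   taken: Remainder = 0x89AB0000, Shift = 32, Imm = 0x01234567.
//   SelectInt64Count returns 4, and SelectInt64 emits the matching sequence:
//     LIS8   r, 0x0123       (high halfword of the upper word)
//     ORI8   r, r, 0x4567    (low halfword of the upper word)
//     RLDICR r, r, 32, 31    (i.e., sldi by 32)
//     ORIS8  r, r, 0x89AB    (high halfword of Remainder; its low halfword is 0)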
+
+
namespace {
class BitPermutationSelector {
struct ValueBit {
unsigned RLAmt;
unsigned StartIdx, EndIdx;
+    // If Repl32 is true, the rotation amount (RLAmt) assumes that the lower
+    // 32 bits of the quantity are replicated in the high 32 bits by the
+    // rotation operator (which is done by rlwinm and friends in 64-bit mode).
+    bool Repl32;
+ // Did converting to Repl32 == true change the rotation factor? If it did,
+ // it decreased it by 32.
+ bool Repl32CR;
+ // Was this group coalesced after setting Repl32 to true?
+ bool Repl32Coalesced;
+
BitGroup(SDValue V, unsigned R, unsigned S, unsigned E)
- : V(V), RLAmt(R), StartIdx(S), EndIdx(E) {
+ : V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false),
+ Repl32Coalesced(false) {
DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R <<
" [" << S << ", " << E << "]\n");
}
unsigned RLAmt;
unsigned NumGroups;
unsigned FirstGroupStartIdx;
+ bool Repl32;
ValueRotInfo()
- : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX) {}
+ : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX),
+ Repl32(false) {}
// For sorting (in reverse order) by NumGroups, and then by
// FirstGroupStartIdx.
bool operator < (const ValueRotInfo &Other) const {
- if (NumGroups > Other.NumGroups)
+    // We need to sort so that the non-Repl32 groups come first because, when
+    // we're doing masking, the Repl32 bit groups might be subsumed into the
+    // 64-bit masking operation.
+ if (Repl32 < Other.Repl32)
+ return true;
+ else if (Repl32 > Other.Repl32)
+ return false;
+ else if (NumGroups > Other.NumGroups)
return true;
else if (NumGroups < Other.NumGroups)
return false;
}
// Collect groups of consecutive bits with the same underlying value and
- // rotation factor.
- void collectBitGroups() {
+  // rotation factor. If we're doing late masking, we ignore zeros; otherwise,
+  // they break up groups.
+ void collectBitGroups(bool LateMask) {
BitGroups.clear();
unsigned LastRLAmt = RLAmt[0];
for (unsigned i = 1; i < Bits.size(); ++i) {
unsigned ThisRLAmt = RLAmt[i];
SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();
+ if (LateMask && !ThisValue) {
+ ThisValue = LastValue;
+ ThisRLAmt = LastRLAmt;
+ // If we're doing late masking, then the first bit group always starts
+ // at zero (even if the first bits were zero).
+ if (BitGroups.empty())
+ LastGroupStartIdx = 0;
+ }
// If this bit has the same underlying value and the same rotate factor as
// the last one, then they're part of the same group.
BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 &&
BitGroups[0].V == BitGroups[BitGroups.size()-1].V &&
BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) {
+      DEBUG(dbgs() << "\tcombining final bit group with initial one\n");
BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx;
BitGroups.erase(BitGroups.begin());
}
ValueRots.clear();
for (auto &BG : BitGroups) {
- ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, BG.RLAmt)];
+ unsigned RLAmtKey = BG.RLAmt + (BG.Repl32 ? 64 : 0);
+ ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, RLAmtKey)];
VRI.V = BG.V;
VRI.RLAmt = BG.RLAmt;
+ VRI.Repl32 = BG.Repl32;
VRI.NumGroups += 1;
VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx);
}
std::sort(ValueRotsVec.begin(), ValueRotsVec.end());
}
+ // In 64-bit mode, rlwinm and friends have a rotation operator that
+ // replicates the low-order 32 bits into the high-order 32-bits. The mask
+ // indices of these instructions can only be in the lower 32 bits, so they
+ // can only represent some 64-bit bit groups. However, when they can be used,
+ // the 32-bit replication can be used to represent, as a single bit group,
+ // otherwise separate bit groups. We'll convert to replicated-32-bit bit
+  // groups when possible.
+ void assignRepl32BitGroups() {
+ // If we have bits like this:
+ //
+ // Indices: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ // V bits: ... 7 6 5 4 3 2 1 0 31 30 29 28 27 26 25 24
+ // Groups: | RLAmt = 8 | RLAmt = 40 |
+ //
+ // But, making use of a 32-bit operation that replicates the low-order 32
+ // bits into the high-order 32 bits, this can be one bit group with a RLAmt
+ // of 8.
+
+    auto IsAllLow32 = [this](BitGroup &BG) {
+ if (BG.StartIdx <= BG.EndIdx) {
+ for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) {
+ if (!Bits[i].hasValue())
+ continue;
+ if (Bits[i].getValueBitIndex() >= 32)
+ return false;
+ }
+ } else {
+ for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) {
+ if (!Bits[i].hasValue())
+ continue;
+ if (Bits[i].getValueBitIndex() >= 32)
+ return false;
+ }
+ for (unsigned i = 0; i <= BG.EndIdx; ++i) {
+ if (!Bits[i].hasValue())
+ continue;
+ if (Bits[i].getValueBitIndex() >= 32)
+ return false;
+ }
+ }
+
+ return true;
+ };
+
+ for (auto &BG : BitGroups) {
+ if (BG.StartIdx < 32 && BG.EndIdx < 32) {
+ if (IsAllLow32(BG)) {
+ if (BG.RLAmt >= 32) {
+ BG.RLAmt -= 32;
+ BG.Repl32CR = true;
+ }
+
+ BG.Repl32 = true;
+
+ DEBUG(dbgs() << "\t32-bit replicated bit group for " <<
+ BG.V.getNode() << " RLAmt = " << BG.RLAmt <<
+ " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n");
+ }
+ }
+ }
+
+ // Now walk through the bit groups, consolidating where possible.
+ for (auto I = BitGroups.begin(); I != BitGroups.end();) {
+ // We might want to remove this bit group by merging it with the previous
+ // group (which might be the ending group).
+ auto IP = (I == BitGroups.begin()) ?
+ std::prev(BitGroups.end()) : std::prev(I);
+ if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt &&
+ I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) {
+
+ DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " <<
+ I->V.getNode() << " RLAmt = " << I->RLAmt <<
+ " [" << I->StartIdx << ", " << I->EndIdx <<
+ "] with group with range [" <<
+ IP->StartIdx << ", " << IP->EndIdx << "]\n");
+
+ IP->EndIdx = I->EndIdx;
+ IP->Repl32CR = IP->Repl32CR || I->Repl32CR;
+ IP->Repl32Coalesced = true;
+ I = BitGroups.erase(I);
+ continue;
+ } else {
+ // There is a special case worth handling: If there is a single group
+ // covering the entire upper 32 bits, and it can be merged with both
+ // the next and previous groups (which might be the same group), then
+ // do so. If it is the same group (so there will be only one group in
+ // total), then we need to reverse the order of the range so that it
+ // covers the entire 64 bits.
+ if (I->StartIdx == 32 && I->EndIdx == 63) {
+ assert(std::next(I) == BitGroups.end() &&
+ "bit group ends at index 63 but there is another?");
+ auto IN = BitGroups.begin();
+
+ if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V &&
+ (I->RLAmt % 32) == IP->RLAmt && (I->RLAmt % 32) == IN->RLAmt &&
+ IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP &&
+ IsAllLow32(*I)) {
+
+ DEBUG(dbgs() << "\tcombining bit group for " <<
+ I->V.getNode() << " RLAmt = " << I->RLAmt <<
+ " [" << I->StartIdx << ", " << I->EndIdx <<
+ "] with 32-bit replicated groups with ranges [" <<
+ IP->StartIdx << ", " << IP->EndIdx << "] and [" <<
+ IN->StartIdx << ", " << IN->EndIdx << "]\n");
+
+ if (IP == IN) {
+ // There is only one other group; change it to cover the whole
+ // range (backward, so that it can still be Repl32 but cover the
+ // whole 64-bit range).
+ IP->StartIdx = 31;
+ IP->EndIdx = 30;
+ IP->Repl32CR = IP->Repl32CR || I->RLAmt >= 32;
+ IP->Repl32Coalesced = true;
+ I = BitGroups.erase(I);
+ } else {
+ // There are two separate groups, one before this group and one
+ // after us (at the beginning). We're going to remove this group,
+ // but also the group at the very beginning.
+ IP->EndIdx = IN->EndIdx;
+ IP->Repl32CR = IP->Repl32CR || IN->Repl32CR || I->RLAmt >= 32;
+ IP->Repl32Coalesced = true;
+ I = BitGroups.erase(I);
+ BitGroups.erase(BitGroups.begin());
+ }
+
+ // This must be the last group in the vector (and we might have
+ // just invalidated the iterator above), so break here.
+ break;
+ }
+ }
+ }
+
+ ++I;
+ }
+ }
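
Continuing the example from the comment at the top of this function (an illustrative sketch of how the flags end up set):

//   The RLAmt = 40 group only touches value bits 24-31, so IsAllLow32 holds;
//   its rotation factor is reduced to 8 and Repl32CR is set. The RLAmt = 8
//   group is marked Repl32 with its factor unchanged. The two groups now agree
//   on (V, RLAmt = 8, Repl32) and are adjacent modulo 64, so the consolidation
//   loop merges them into a single group spanning [0, 15] with
//   Repl32Coalesced = true, which one rlwinm-style operation can then cover.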
+
SDValue getI32Imm(unsigned Imm) {
return CurDAG->getTargetConstant(Imm, MVT::i32);
}
+ uint64_t getZerosMask() {
+ uint64_t Mask = 0;
+ for (unsigned i = 0; i < Bits.size(); ++i) {
+ if (Bits[i].hasValue())
+ continue;
+      Mask |= (1ull << i);
+ }
+
+ return ~Mask;
+ }
+
// Depending on the number of groups for a particular value, it might be
// better to rotate, mask explicitly (using andi/andis), and then or the
// result. Select this part of the result first.
- void SelectAndParts32(SDNode *N, SDValue &Res) {
- SDLoc dl(N);
+ void SelectAndParts32(SDLoc dl, SDValue &Res, unsigned *InstCnt) {
+ if (BPermRewriterNoMasking)
+ return;
for (ValueRotInfo &VRI : ValueRotsVec) {
unsigned Mask = 0;
(unsigned) (ANDISMask != 0) +
(unsigned) (ANDIMask != 0 && ANDISMask != 0) +
(unsigned) (bool) Res;
+
+ DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
+ " RL: " << VRI.RLAmt << ":" <<
+ "\n\t\t\tisel using masking: " << NumAndInsts <<
+ " using rotates: " << VRI.NumGroups << "\n");
+
if (NumAndInsts >= VRI.NumGroups)
continue;
+ DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+
+ if (InstCnt) *InstCnt += NumAndInsts;
+
SDValue VRot;
if (VRI.RLAmt) {
SDValue Ops[] =
}
// Instruction selection for the 32-bit case.
- SDNode *Select32(SDNode *N) {
+ SDNode *Select32(SDNode *N, bool LateMask, unsigned *InstCnt) {
SDLoc dl(N);
SDValue Res;
+ if (InstCnt) *InstCnt = 0;
+
// Take care of cases that should use andi/andis first.
- SelectAndParts32(N, Res);
+ SelectAndParts32(dl, Res, InstCnt);
// If we've not yet selected a 'starting' instruction, and we have no zeros
// to fill in, select the (Value, RLAmt) with the highest priority (largest
// number of groups), and start with this rotated value.
- if (!HasZeros && !Res) {
+ if ((!HasZeros || LateMask) && !Res) {
ValueRotInfo &VRI = ValueRotsVec[0];
if (VRI.RLAmt) {
+ if (InstCnt) *InstCnt += 1;
SDValue Ops[] =
{ VRI.V, getI32Imm(VRI.RLAmt), getI32Imm(0), getI32Imm(31) };
Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
}
}
+ if (InstCnt) *InstCnt += BitGroups.size();
+
// Insert the other groups (one at a time).
for (auto &BG : BitGroups) {
- if (!Res.getNode()) {
+ if (!Res) {
SDValue Ops[] =
{ BG.V, getI32Imm(BG.RLAmt), getI32Imm(Bits.size() - BG.EndIdx - 1),
getI32Imm(Bits.size() - BG.StartIdx - 1) };
}
}
+ if (LateMask) {
+ unsigned Mask = (unsigned) getZerosMask();
+
+ unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16;
+ assert((ANDIMask != 0 || ANDISMask != 0) &&
+ "No set bits in zeros mask?");
+
+ if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) +
+ (unsigned) (ANDISMask != 0) +
+ (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+
+ SDValue ANDIVal, ANDISVal;
+ if (ANDIMask != 0)
+ ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
+ Res, getI32Imm(ANDIMask)), 0);
+ if (ANDISMask != 0)
+ ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
+ Res, getI32Imm(ANDISMask)), 0);
+
+ if (!ANDIVal)
+ Res = ANDISVal;
+ else if (!ANDISVal)
+ Res = ANDIVal;
+ else
+ Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+ ANDIVal, ANDISVal), 0);
+ }
+
return Res.getNode();
}
+ unsigned SelectRotMask64Count(unsigned RLAmt, bool Repl32,
+ unsigned MaskStart, unsigned MaskEnd,
+ bool IsIns) {
+ // In the notation used by the instructions, 'start' and 'end' are reversed
+ // because bits are counted from high to low order.
+ unsigned InstMaskStart = 64 - MaskEnd - 1,
+ InstMaskEnd = 64 - MaskStart - 1;
+
+ if (Repl32)
+ return 1;
+
+ if ((!IsIns && (InstMaskEnd == 63 || InstMaskStart == 0)) ||
+ InstMaskEnd == 63 - RLAmt)
+ return 1;
+
+ return 2;
+ }
+
+  // For 64-bit values, not all combinations of rotates and masks are
+  // available. Produce the rotate-and-mask with one instruction if possible,
+  // and otherwise fall back to a two-instruction sequence.
+ SDValue SelectRotMask64(SDValue V, SDLoc dl, unsigned RLAmt, bool Repl32,
+ unsigned MaskStart, unsigned MaskEnd,
+ unsigned *InstCnt = nullptr) {
+ // In the notation used by the instructions, 'start' and 'end' are reversed
+ // because bits are counted from high to low order.
+ unsigned InstMaskStart = 64 - MaskEnd - 1,
+ InstMaskEnd = 64 - MaskStart - 1;
+
+ if (InstCnt) *InstCnt += 1;
+
+ if (Repl32) {
+ // This rotation amount assumes that the lower 32 bits of the quantity
+ // are replicated in the high 32 bits by the rotation operator (which is
+ // done by rlwinm and friends).
+ assert(InstMaskStart >= 32 && "Mask cannot start out of range");
+ assert(InstMaskEnd >= 32 && "Mask cannot end out of range");
+ SDValue Ops[] =
+ { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart - 32),
+ getI32Imm(InstMaskEnd - 32) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64,
+ Ops), 0);
+ }
+
+ if (InstMaskEnd == 63) {
+ SDValue Ops[] =
+ { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0);
+ }
+
+ if (InstMaskStart == 0) {
+ SDValue Ops[] =
+ { V, getI32Imm(RLAmt), getI32Imm(InstMaskEnd) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0);
+ }
+
+ if (InstMaskEnd == 63 - RLAmt) {
+ SDValue Ops[] =
+ { V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0);
+ }
+
+ // We cannot do this with a single instruction, so we'll use two. The
+ // problem is that we're not free to choose both a rotation amount and mask
+ // start and end independently. We can choose an arbitrary mask start and
+ // end, but then the rotation amount is fixed. Rotation, however, can be
+ // inverted, and so by applying an "inverse" rotation first, we can get the
+ // desired result.
+ if (InstCnt) *InstCnt += 1;
+
+    // The rotation amount for the second instruction must be MaskStart.
+ unsigned RLAmt2 = MaskStart;
+ // The first instruction must rotate V so that the overall rotation amount
+ // is RLAmt.
+ unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64;
+ if (RLAmt1)
+ V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63);
+ return SelectRotMask64(V, dl, RLAmt2, false, MaskStart, MaskEnd);
+ }
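
A worked instance of the two-instruction fallback (the numbers are arbitrary, chosen for illustration):

//   RLAmt = 5, MaskStart = 10, MaskEnd = 20
//   -> InstMaskStart = 43, InstMaskEnd = 53: no single rldicl/rldicr/rldic
//      form applies (InstMaskEnd is neither 63 nor 63 - RLAmt, and
//      InstMaskStart is not 0).
//   RLAmt2 = MaskStart = 10, so the recursive call satisfies
//      InstMaskEnd == 63 - RLAmt2 and selects a single rldic.
//   RLAmt1 = (64 + 5 - 10) % 64 = 59 is a plain rotate (rldicl with a full
//      mask), and 59 + 10 = 69 = 5 (mod 64) restores the requested rotation.
//   The same decomposition is reused by SelectRotMaskIns64 below.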
+
+  // For 64-bit values, not all combinations of rotates and masks are
+  // available. Produce the rotate-mask-and-insert with one instruction if
+  // possible, and otherwise fall back to a two-instruction sequence.
+ SDValue SelectRotMaskIns64(SDValue Base, SDValue V, SDLoc dl, unsigned RLAmt,
+ bool Repl32, unsigned MaskStart,
+ unsigned MaskEnd, unsigned *InstCnt = nullptr) {
+ // In the notation used by the instructions, 'start' and 'end' are reversed
+ // because bits are counted from high to low order.
+ unsigned InstMaskStart = 64 - MaskEnd - 1,
+ InstMaskEnd = 64 - MaskStart - 1;
+
+ if (InstCnt) *InstCnt += 1;
+
+ if (Repl32) {
+ // This rotation amount assumes that the lower 32 bits of the quantity
+ // are replicated in the high 32 bits by the rotation operator (which is
+ // done by rlwinm and friends).
+ assert(InstMaskStart >= 32 && "Mask cannot start out of range");
+ assert(InstMaskEnd >= 32 && "Mask cannot end out of range");
+ SDValue Ops[] =
+ { Base, V, getI32Imm(RLAmt), getI32Imm(InstMaskStart - 32),
+ getI32Imm(InstMaskEnd - 32) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64,
+ Ops), 0);
+ }
+
+ if (InstMaskEnd == 63 - RLAmt) {
+ SDValue Ops[] =
+ { Base, V, getI32Imm(RLAmt), getI32Imm(InstMaskStart) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0);
+ }
+
+ // We cannot do this with a single instruction, so we'll use two. The
+ // problem is that we're not free to choose both a rotation amount and mask
+ // start and end independently. We can choose an arbitrary mask start and
+ // end, but then the rotation amount is fixed. Rotation, however, can be
+ // inverted, and so by applying an "inverse" rotation first, we can get the
+ // desired result.
+ if (InstCnt) *InstCnt += 1;
+
+    // The rotation amount for the second instruction must be MaskStart.
+ unsigned RLAmt2 = MaskStart;
+ // The first instruction must rotate V so that the overall rotation amount
+ // is RLAmt.
+ unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64;
+ if (RLAmt1)
+ V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63);
+ return SelectRotMaskIns64(Base, V, dl, RLAmt2, false, MaskStart, MaskEnd);
+ }
+
+ void SelectAndParts64(SDLoc dl, SDValue &Res, unsigned *InstCnt) {
+ if (BPermRewriterNoMasking)
+ return;
+
+ // The idea here is the same as in the 32-bit version, but with additional
+ // complications from the fact that Repl32 might be true. Because we
+ // aggressively convert bit groups to Repl32 form (which, for small
+ // rotation factors, involves no other change), and then coalesce, it might
+ // be the case that a single 64-bit masking operation could handle both
+ // some Repl32 groups and some non-Repl32 groups. If converting to Repl32
+    // form allowed coalescing, then we must use a 32-bit rotation in order to
+ // completely capture the new combined bit group.
+
+ for (ValueRotInfo &VRI : ValueRotsVec) {
+ uint64_t Mask = 0;
+
+ // We need to add to the mask all bits from the associated bit groups.
+ // If Repl32 is false, we need to add bits from bit groups that have
+      // Repl32 true, but are trivially convertible to Repl32 false. Such a
+      // group is trivially convertible if it overlaps only with the lower 32
+ // bits, and the group has not been coalesced.
+ auto MatchingBG = [VRI](BitGroup &BG) {
+ if (VRI.V != BG.V)
+ return false;
+
+ unsigned EffRLAmt = BG.RLAmt;
+ if (!VRI.Repl32 && BG.Repl32) {
+ if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx <= BG.EndIdx &&
+ !BG.Repl32Coalesced) {
+ if (BG.Repl32CR)
+ EffRLAmt += 32;
+ } else {
+ return false;
+ }
+ } else if (VRI.Repl32 != BG.Repl32) {
+ return false;
+ }
+
+ if (VRI.RLAmt != EffRLAmt)
+ return false;
+
+ return true;
+ };
+
+ for (auto &BG : BitGroups) {
+ if (!MatchingBG(BG))
+ continue;
+
+ if (BG.StartIdx <= BG.EndIdx) {
+ for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i)
+            Mask |= (1ull << i);
+ } else {
+ for (unsigned i = BG.StartIdx; i < Bits.size(); ++i)
+            Mask |= (1ull << i);
+ for (unsigned i = 0; i <= BG.EndIdx; ++i)
+            Mask |= (1ull << i);
+ }
+ }
+
+ // We can use the 32-bit andi/andis technique if the mask does not
+ // require any higher-order bits. This can save an instruction compared
+ // to always using the general 64-bit technique.
+ bool Use32BitInsts = isUInt<32>(Mask);
+ // Compute the masks for andi/andis that would be necessary.
+ unsigned ANDIMask = (Mask & UINT16_MAX),
+ ANDISMask = (Mask >> 16) & UINT16_MAX;
+
+ bool NeedsRotate = VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask));
+
+ unsigned NumAndInsts = (unsigned) NeedsRotate +
+ (unsigned) (bool) Res;
+ if (Use32BitInsts)
+ NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) +
+ (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+ else
+ NumAndInsts += SelectInt64Count(Mask) + /* and */ 1;
+
+ unsigned NumRLInsts = 0;
+ bool FirstBG = true;
+ for (auto &BG : BitGroups) {
+ if (!MatchingBG(BG))
+ continue;
+ NumRLInsts +=
+ SelectRotMask64Count(BG.RLAmt, BG.Repl32, BG.StartIdx, BG.EndIdx,
+ !FirstBG);
+ FirstBG = false;
+ }
+
+ DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
+ " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") <<
+ "\n\t\t\tisel using masking: " << NumAndInsts <<
+ " using rotates: " << NumRLInsts << "\n");
+
+ // When we'd use andi/andis, we bias toward using the rotates (andi only
+ // has a record form, and is cracked on POWER cores). However, when using
+ // general 64-bit constant formation, bias toward the constant form,
+ // because that exposes more opportunities for CSE.
+ if (NumAndInsts > NumRLInsts)
+ continue;
+ if (Use32BitInsts && NumAndInsts == NumRLInsts)
+ continue;
+
+ DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+
+ if (InstCnt) *InstCnt += NumAndInsts;
+
+ SDValue VRot;
+ // We actually need to generate a rotation if we have a non-zero rotation
+ // factor or, in the Repl32 case, if we care about any of the
+ // higher-order replicated bits. In the latter case, we generate a mask
+ // backward so that it actually includes the entire 64 bits.
+ if (VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask)))
+ VRot = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32,
+ VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63);
+ else
+ VRot = VRI.V;
+
+ SDValue TotalVal;
+ if (Use32BitInsts) {
+ assert((ANDIMask != 0 || ANDISMask != 0) &&
+ "No set bits in mask when using 32-bit ands for 64-bit value");
+
+ SDValue ANDIVal, ANDISVal;
+ if (ANDIMask != 0)
+ ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
+ VRot, getI32Imm(ANDIMask)), 0);
+ if (ANDISMask != 0)
+ ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
+ VRot, getI32Imm(ANDISMask)), 0);
+
+ if (!ANDIVal)
+ TotalVal = ANDISVal;
+ else if (!ANDISVal)
+ TotalVal = ANDIVal;
+ else
+ TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+ ANDIVal, ANDISVal), 0);
+ } else {
+ TotalVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0);
+ TotalVal =
+ SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+ VRot, TotalVal), 0);
+ }
+
+ if (!Res)
+ Res = TotalVal;
+ else
+ Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+ Res, TotalVal), 0);
+
+ // Now, remove all groups with this underlying value and rotation
+ // factor.
+ for (auto I = BitGroups.begin(); I != BitGroups.end();) {
+ if (MatchingBG(*I))
+ I = BitGroups.erase(I);
+ else
+ ++I;
+ }
+ }
+ }
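
A small cost-comparison example for the heuristic above (assumed numbers, for illustration only):

//   Suppose one value has three matching bit groups, each selectable as a
//   single rotate (NumRLInsts = 3), no rotate is needed for the masked form,
//   Res is still empty, and the combined mask has both halfwords set, so
//   NumAndInsts = 0 + 0 + 1 + 1 + 1 = 3. Because Use32BitInsts holds and the
//   counts tie, the masking form is skipped and the rotates win.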
+
+ // Instruction selection for the 64-bit case.
+ SDNode *Select64(SDNode *N, bool LateMask, unsigned *InstCnt) {
+ SDLoc dl(N);
+ SDValue Res;
+
+ if (InstCnt) *InstCnt = 0;
+
+ // Take care of cases that should use andi/andis first.
+ SelectAndParts64(dl, Res, InstCnt);
+
+ // If we've not yet selected a 'starting' instruction, and we have no zeros
+ // to fill in, select the (Value, RLAmt) with the highest priority (largest
+ // number of groups), and start with this rotated value.
+ if ((!HasZeros || LateMask) && !Res) {
+ // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32
+ // groups will come first, and so the VRI representing the largest number
+      // of groups might not be first (it might be the first Repl32 entry).
+ unsigned MaxGroupsIdx = 0;
+ if (!ValueRotsVec[0].Repl32) {
+ for (unsigned i = 0, ie = ValueRotsVec.size(); i < ie; ++i)
+ if (ValueRotsVec[i].Repl32) {
+ if (ValueRotsVec[i].NumGroups > ValueRotsVec[0].NumGroups)
+ MaxGroupsIdx = i;
+ break;
+ }
+ }
+
+ ValueRotInfo &VRI = ValueRotsVec[MaxGroupsIdx];
+ bool NeedsRotate = false;
+ if (VRI.RLAmt) {
+ NeedsRotate = true;
+ } else if (VRI.Repl32) {
+ for (auto &BG : BitGroups) {
+ if (BG.V != VRI.V || BG.RLAmt != VRI.RLAmt ||
+ BG.Repl32 != VRI.Repl32)
+ continue;
+
+ // We don't need a rotate if the bit group is confined to the lower
+ // 32 bits.
+ if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx < BG.EndIdx)
+ continue;
+
+ NeedsRotate = true;
+ break;
+ }
+ }
+
+ if (NeedsRotate)
+ Res = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32,
+ VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63,
+ InstCnt);
+ else
+ Res = VRI.V;
+
+ // Now, remove all groups with this underlying value and rotation factor.
+ if (Res)
+ for (auto I = BitGroups.begin(); I != BitGroups.end();) {
+ if (I->V == VRI.V && I->RLAmt == VRI.RLAmt && I->Repl32 == VRI.Repl32)
+ I = BitGroups.erase(I);
+ else
+ ++I;
+ }
+ }
+
+ // Because 64-bit rotates are more flexible than inserts, we might have a
+ // preference regarding which one we do first (to save one instruction).
+ if (!Res)
+ for (auto I = BitGroups.begin(), IE = BitGroups.end(); I != IE; ++I) {
+ if (SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx,
+ false) <
+ SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx,
+ true)) {
+ if (I != BitGroups.begin()) {
+ BitGroup BG = *I;
+ BitGroups.erase(I);
+ BitGroups.insert(BitGroups.begin(), BG);
+ }
+
+ break;
+ }
+ }
+
+ // Insert the other groups (one at a time).
+ for (auto &BG : BitGroups) {
+ if (!Res)
+ Res = SelectRotMask64(BG.V, dl, BG.RLAmt, BG.Repl32, BG.StartIdx,
+ BG.EndIdx, InstCnt);
+ else
+ Res = SelectRotMaskIns64(Res, BG.V, dl, BG.RLAmt, BG.Repl32,
+ BG.StartIdx, BG.EndIdx, InstCnt);
+ }
+
+ if (LateMask) {
+ uint64_t Mask = getZerosMask();
+
+ // We can use the 32-bit andi/andis technique if the mask does not
+ // require any higher-order bits. This can save an instruction compared
+ // to always using the general 64-bit technique.
+ bool Use32BitInsts = isUInt<32>(Mask);
+ // Compute the masks for andi/andis that would be necessary.
+ unsigned ANDIMask = (Mask & UINT16_MAX),
+ ANDISMask = (Mask >> 16) & UINT16_MAX;
+
+ if (Use32BitInsts) {
+ assert((ANDIMask != 0 || ANDISMask != 0) &&
+ "No set bits in mask when using 32-bit ands for 64-bit value");
+
+ if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) +
+ (unsigned) (ANDISMask != 0) +
+ (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+
+ SDValue ANDIVal, ANDISVal;
+ if (ANDIMask != 0)
+ ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
+ Res, getI32Imm(ANDIMask)), 0);
+ if (ANDISMask != 0)
+ ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
+ Res, getI32Imm(ANDISMask)), 0);
+
+ if (!ANDIVal)
+ Res = ANDISVal;
+ else if (!ANDISVal)
+ Res = ANDIVal;
+ else
+ Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+ ANDIVal, ANDISVal), 0);
+ } else {
+ if (InstCnt) *InstCnt += SelectInt64Count(Mask) + /* and */ 1;
+
+ SDValue MaskVal = SDValue(SelectInt64(CurDAG, dl, Mask), 0);
+ Res =
+ SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+ Res, MaskVal), 0);
+ }
+ }
+
+ return Res.getNode();
+ }
+
+ SDNode *Select(SDNode *N, bool LateMask, unsigned *InstCnt = nullptr) {
+ // Fill in BitGroups.
+ collectBitGroups(LateMask);
+ if (BitGroups.empty())
+ return nullptr;
+
+ // For 64-bit values, figure out when we can use 32-bit instructions.
+ if (Bits.size() == 64)
+ assignRepl32BitGroups();
+
+ // Fill in ValueRotsVec.
+ collectValueRotInfo();
+
+ if (Bits.size() == 32) {
+ return Select32(N, LateMask, InstCnt);
+ } else {
+ assert(Bits.size() == 64 && "Not 64 bits here?");
+ return Select64(N, LateMask, InstCnt);
+ }
+
+ return nullptr;
+ }
+
SmallVector<ValueBit, 64> Bits;
bool HasZeros;
// Fill in RLAmt and set HasZeros.
computeRotationAmounts();
- // Fill in BitGroups.
- collectBitGroups();
- if (BitGroups.empty())
- return nullptr;
+ if (!HasZeros)
+ return Select(N, false);
- // Fill in ValueRotsVec.
- collectValueRotInfo();
+ // We currently have two techniques for handling results with zeros: early
+ // masking (the default) and late masking. Late masking is sometimes more
+ // efficient, but because the structure of the bit groups is different, it
+ // is hard to tell without generating both and comparing the results. With
+ // late masking, we ignore zeros in the resulting value when inserting each
+ // set of bit groups, and then mask in the zeros at the end. With early
+ // masking, we only insert the non-zero parts of the result at every step.
- if (Bits.size() == 32) {
- return Select32(N);
- } else {
- assert(Bits.size() == 64 && "Not 64 bits here?");
- // TODO: The 64-bit case!
+ unsigned InstCnt, InstCntLateMask;
+ DEBUG(dbgs() << "\tEarly masking:\n");
+ SDNode *RN = Select(N, false, &InstCnt);
+ DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n");
+
+ DEBUG(dbgs() << "\tLate masking:\n");
+ SDNode *RNLM = Select(N, true, &InstCntLateMask);
+ DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask <<
+ " instructions\n");
+
+ if (InstCnt <= InstCntLateMask) {
+ DEBUG(dbgs() << "\tUsing early-masking for isel\n");
+ return RN;
}
- return nullptr;
+ DEBUG(dbgs() << "\tUsing late-masking for isel\n");
+ return RNLM;
}
};
} // anonymous namespace
N->getValueType(0) != MVT::i64)
return nullptr;
+ if (!UseBitPermRewriter)
+ return nullptr;
+
switch (N->getOpcode()) {
default: break;
case ISD::ROTL:
default: break;
case ISD::Constant: {
- if (N->getValueType(0) == MVT::i64) {
- // Get 64 bit value.
- int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
- // Assume no remaining bits.
- unsigned Remainder = 0;
- // Assume no shift required.
- unsigned Shift = 0;
-
- // If it can't be represented as a 32 bit value.
- if (!isInt<32>(Imm)) {
- Shift = countTrailingZeros<uint64_t>(Imm);
- int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
-
- // If the shifted value fits 32 bits.
- if (isInt<32>(ImmSh)) {
- // Go with the shifted value.
- Imm = ImmSh;
- } else {
- // Still stuck with a 64 bit value.
- Remainder = Imm;
- Shift = 32;
- Imm >>= 32;
- }
- }
-
- // Intermediate operand.
- SDNode *Result;
-
- // Handle first 32 bits.
- unsigned Lo = Imm & 0xFFFF;
- unsigned Hi = (Imm >> 16) & 0xFFFF;
-
- // Simple value.
- if (isInt<16>(Imm)) {
- // Just the Lo bits.
- Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
- } else if (Lo) {
- // Handle the Hi bits.
- unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
- Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
- // And Lo bits.
- Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
- SDValue(Result, 0), getI32Imm(Lo));
- } else {
- // Just the Hi bits.
- Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
- }
-
- // If no shift, we're done.
- if (!Shift) return Result;
-
- // Shift for next step if the upper 32-bits were not zero.
- if (Imm) {
- Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
- SDValue(Result, 0),
- getI32Imm(Shift),
- getI32Imm(63 - Shift));
- }
-
- // Add in the last bits as required.
- if ((Hi = (Remainder >> 16) & 0xFFFF)) {
- Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
- SDValue(Result, 0), getI32Imm(Hi));
- }
- if ((Lo = Remainder & 0xFFFF)) {
- Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
- SDValue(Result, 0), getI32Imm(Lo));
- }
-
- return Result;
- }
+ if (N->getValueType(0) == MVT::i64)
+ return SelectInt64(CurDAG, N);
break;
}
; CHECK: blr
}
+define i64 @bs8(i64 %x) #0 {
+entry:
+ %0 = tail call i64 @llvm.bswap.i64(i64 %x)
+ ret i64 %0
+
+; CHECK-LABEL: @bs8
+; CHECK-DAG: rldicl [[REG1:[0-9]+]], 3, 16, 0
+; CHECK-DAG: rldicl [[REG2:[0-9]+]], 3, 8, 0
+; CHECK-DAG: rldicl [[REG3:[0-9]+]], 3, 24, 0
+; CHECK-DAG: rldimi [[REG2]], [[REG1]], 8, 48
+; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 32, 0
+; CHECK-DAG: rldimi [[REG2]], [[REG3]], 16, 40
+; CHECK-DAG: rldicl [[REG5:[0-9]+]], 3, 48, 0
+; CHECK-DAG: rldimi [[REG2]], [[REG4]], 24, 32
+; CHECK-DAG: rldicl [[REG6:[0-9]+]], 3, 56, 0
+; CHECK-DAG: rldimi [[REG2]], [[REG5]], 40, 16
+; CHECK-DAG: rldimi [[REG2]], [[REG6]], 48, 8
+; CHECK-DAG: rldimi [[REG2]], 3, 56, 0
+; CHECK: mr 3, [[REG2]]
+; CHECK: blr
+}
+
+define i64 @test1(i64 %i0, i64 %i1) #0 {
+entry:
+ %0 = lshr i64 %i1, 8
+ %and = and i64 %0, 5963776000
+ ret i64 %and
+
+; CHECK-LABEL: @test1
+; CHECK-DAG: li [[REG1:[0-9]+]], 11375
+; CHECK-DAG: rldicl [[REG3:[0-9]+]], 4, 56, 0
+; CHECK-DAG: sldi [[REG2:[0-9]+]], [[REG1]], 19
+; CHECK: and 3, [[REG3]], [[REG2]]
+; CHECK: blr
+}
+
+define i64 @test2(i64 %i0, i64 %i1) #0 {
+entry:
+ %0 = lshr i64 %i1, 6
+ %and = and i64 %0, 133434808670355456
+ ret i64 %and
+
+; CHECK-LABEL: @test2
+; CHECK-DAG: lis [[REG1:[0-9]+]], 474
+; CHECK-DAG: rldicl [[REG5:[0-9]+]], 4, 58, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 3648
+; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 32
+; CHECK-DAG: oris [[REG4:[0-9]+]], [[REG3]], 25464
+; CHECK: and 3, [[REG5]], [[REG4]]
+; CHECK: blr
+}
+
+define i64 @test3(i64 %i0, i64 %i1) #0 {
+entry:
+ %0 = shl i64 %i0, 34
+ %and = and i64 %0, 191795733152661504
+ ret i64 %and
+
+; CHECK-LABEL: @test3
+; CHECK-DAG: lis [[REG1:[0-9]+]], 170
+; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 34, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 22861
+; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 34
+; CHECK: and 3, [[REG4]], [[REG3]]
+; CHECK: blr
+}
+
+define i64 @test4(i64 %i0, i64 %i1) #0 {
+entry:
+ %0 = lshr i64 %i1, 15
+ %and = and i64 %0, 58195968
+ ret i64 %and
+
+; CHECK-LABEL: @test4
+; CHECK: rldicl [[REG1:[0-9]+]], 4, 49, 0
+; CHECK: andis. 3, [[REG1]], 888
+; CHECK: blr
+}
+
+define i64 @test5(i64 %i0, i64 %i1) #0 {
+entry:
+ %0 = shl i64 %i1, 12
+ %and = and i64 %0, 127252959854592
+ ret i64 %and
+
+; CHECK-LABEL: @test5
+; CHECK-DAG: lis [[REG1:[0-9]+]], 3703
+; CHECK-DAG: rldicl [[REG4:[0-9]+]], 4, 12, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 35951
+; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 19
+; CHECK: and 3, [[REG4]], [[REG3]]
+; CHECK: blr
+}
+
; Function Attrs: nounwind readnone
define zeroext i32 @test6(i32 zeroext %x) #0 {
entry:
; CHECK: blr
}
+define i64 @test7(i64 %i0, i64 %i1) #0 {
+entry:
+ %0 = lshr i64 %i0, 5
+ %and = and i64 %0, 58195968
+ ret i64 %and
+
+; CHECK-LABEL: @test7
+; CHECK: rlwinm [[REG1:[0-9]+]], 3, 27, 9, 12
+; CHECK: rlwimi [[REG1]], 3, 27, 6, 7
+; CHECK: mr 3, [[REG1]]
+; CHECK: blr
+}
+
+define i64 @test8(i64 %i0, i64 %i1) #0 {
+entry:
+ %0 = lshr i64 %i0, 1
+ %and = and i64 %0, 169172533248
+ ret i64 %and
+
+; CHECK-LABEL: @test8
+; CHECK-DAG: lis [[REG1:[0-9]+]], 4
+; CHECK-DAG: rldicl [[REG4:[0-9]+]], 3, 63, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 60527
+; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 19
+; CHECK: and 3, [[REG4]], [[REG3]]
+; CHECK: blr
+}
+
+define i64 @test9(i64 %i0, i64 %i1) #0 {
+entry:
+ %0 = lshr i64 %i1, 14
+ %and = and i64 %0, 18848677888
+ %1 = shl i64 %i1, 51
+ %and3 = and i64 %1, 405323966463344640
+ %or4 = or i64 %and, %and3
+ ret i64 %or4
+
+; CHECK-LABEL: @test9
+; CHECK-DAG: lis [[REG1:[0-9]+]], 1440
+; CHECK-DAG: rldicl [[REG5:[0-9]+]], 4, 62, 0
+; CHECK-DAG: rldicl [[REG6:[0-9]+]], 4, 50, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 4
+; CHECK-DAG: rldimi [[REG6]], [[REG5]], 53, 0
+; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG2]], 32
+; CHECK-DAG: oris [[REG4:[0-9]+]], [[REG3]], 25464
+; CHECK: and 3, [[REG6]], [[REG4]]
+; CHECK: blr
+}
+
+define i64 @test10(i64 %i0, i64 %i1) #0 {
+entry:
+ %0 = shl i64 %i0, 37
+ %and = and i64 %0, 15881483390550016
+ %1 = shl i64 %i0, 25
+ %and3 = and i64 %1, 2473599172608
+ %or4 = or i64 %and, %and3
+ ret i64 %or4
+
+; CHECK-LABEL: @test10
+; CHECK-DAG: lis [[REG1:[0-9]+]], 1
+; CHECK-DAG: rldicl [[REG6:[0-9]+]], 3, 25, 0
+; CHECK-DAG: rldicl [[REG7:[0-9]+]], 3, 37, 0
+; CHECK-DAG: ori [[REG2:[0-9]+]], [[REG1]], 8183
+; CHECK-DAG: ori [[REG3:[0-9]+]], [[REG1]], 50017
+; CHECK-DAG: sldi [[REG4:[0-9]+]], [[REG2]], 25
+; CHECK-DAG: sldi [[REG5:[0-9]+]], [[REG3]], 37
+; CHECK-DAG: and [[REG8:[0-9]+]], [[REG6]], [[REG4]]
+; CHECK-DAG: and [[REG9:[0-9]+]], [[REG7]], [[REG5]]
+; CHECK: or 3, [[REG9]], [[REG8]]
+; CHECK: blr
+}
+
+define i64 @test11(i64 %x) #0 {
+entry:
+ %and = and i64 %x, 4294967295
+ %shl = shl i64 %x, 32
+ %or = or i64 %and, %shl
+ ret i64 %or
+
+; CHECK-LABEL: @test11
+; CHECK: rlwinm 3, 3, 0, 1, 0
+; CHECK: blr
+}
+
+define i64 @test12(i64 %x) #0 {
+entry:
+ %and = and i64 %x, 4294905855
+ %shl = shl i64 %x, 32
+ %or = or i64 %and, %shl
+ ret i64 %or
+
+; CHECK-LABEL: @test12
+; CHECK: rlwinm 3, 3, 0, 20, 15
+; CHECK: blr
+}
+
+define i64 @test13(i64 %x) #0 {
+entry:
+ %shl = shl i64 %x, 4
+ %and = and i64 %shl, 240
+ %shr = lshr i64 %x, 28
+ %and1 = and i64 %shr, 15
+ %or = or i64 %and, %and1
+ ret i64 %or
+
+; CHECK-LABEL: @test13
+; CHECK: rlwinm 3, 3, 4, 24, 31
+; CHECK: blr
+}
+
+define i64 @test14(i64 %x) #0 {
+entry:
+ %shl = shl i64 %x, 4
+ %and = and i64 %shl, 240
+ %shr = lshr i64 %x, 28
+ %and1 = and i64 %shr, 15
+ %and2 = and i64 %x, -4294967296
+ %or = or i64 %and1, %and2
+ %or3 = or i64 %or, %and
+ ret i64 %or3
+
+; CHECK-LABEL: @test14
+; CHECK: rldicr [[REG1:[0-9]+]], 3, 0, 31
+; CHECK: rlwimi [[REG1]], 3, 4, 24, 31
+; CHECK: mr 3, [[REG1]]
+; CHECK: blr
+}
+
+define i64 @test15(i64 %x) #0 {
+entry:
+ %shl = shl i64 %x, 4
+ %and = and i64 %shl, 240
+ %shr = lshr i64 %x, 28
+ %and1 = and i64 %shr, 15
+ %and2 = and i64 %x, -256
+ %or = or i64 %and1, %and2
+ %or3 = or i64 %or, %and
+ ret i64 %or3
+
+; CHECK-LABEL: @test15
+; CHECK: rlwimi 3, 3, 4, 24, 31
+; CHECK: blr
+}
+
; Function Attrs: nounwind readnone
declare i32 @llvm.bswap.i32(i32) #0
+declare i64 @llvm.bswap.i64(i64) #0
attributes #0 = { nounwind readnone }