}
}
-
// Return true if I should be commuted before adding it's left and right
// operands to the arrays Left and Right.
//
// a splat to lower the vectorizing cost.
static bool shouldReorderOperands(int i, Instruction &I,
SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right) {
+ SmallVectorImpl<Value *> &Right,
+ bool AllSameOpcodeLeft,
+ bool AllSameOpcodeRight, bool SplatLeft,
+ bool SplatRight) {
Value *VLeft = I.getOperand(0);
Value *VRight = I.getOperand(1);
+ // If we have "SplatRight", try to see if commuting is needed to preserve it.
+ if (SplatRight) {
+ if (VRight == Right[i - 1])
+ // Preserve SplatRight
+ return false;
+ if (VLeft == Right[i - 1]) {
+ // Commuting would preserve SplatRight, but we don't want to break
+ // SplatLeft either, i.e. preserve the original order if possible.
+ // (FIXME: why do we care?)
+ if (SplatLeft && VLeft == Left[i - 1])
+ return false;
+ return true;
+ }
+ }
+ // Symmetrically handle SplatLeft.
+ if (SplatLeft) {
+ if (VLeft == Left[i - 1])
+ // Preserve SplatLeft
+ return false;
+ if (VRight == Left[i - 1])
+ return true;
+ }
+
Instruction *ILeft = dyn_cast<Instruction>(VLeft);
Instruction *IRight = dyn_cast<Instruction>(VRight);
- // Sort two opcodes. In the code below we try to preserve the ability to use
- // broadcast of values instead of individual inserts.
- // vl1 = load
- // vl2 = phi
- // vr1 = load
- // vr2 = vr2
- // = vl1 x vr1
- // = vl2 x vr2
- // If we just sorted according to opcode we would leave the first line in
- // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
- // = vl1 x vr1
- // = vr2 x vl2
- // Because vr2 and vr1 are from the same load we loose the opportunity of a
- // broadcast for the packed right side in the backend: we have [vr1, vl2]
- // instead of [vr1, vr2=vr1].
- if (ILeft && IRight) {
- if (ILeft->getOpcode() > IRight->getOpcode() &&
- Right[i - 1] != IRight) {
- // Try not to destroy a broad cast for no apparent benefit.
- return true;
- } else if (ILeft->getOpcode() == IRight->getOpcode() &&
- Right[i - 1] == ILeft) {
- // Try preserve broadcasts.
- return true;
- } else if (ILeft->getOpcode() == IRight->getOpcode() &&
- Left[i - 1] == IRight) {
- // Try preserve broadcasts.
+ // If we have "AllSameOpcodeRight", try to see if the left operand preserves
+ // it and the right one does not; in this case we want to commute.
+ if (AllSameOpcodeRight) {
+ unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
+ if (IRight && RightPrevOpcode == IRight->getOpcode())
+ // Do not commute; a match on the right preserves AllSameOpcodeRight.
+ return false;
+ if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
+ // We have a match and may want to commute, but first check if the
+ // existing operands on the Left also match, in which case we keep the
+ // original order to preserve AllSameOpcodeLeft.
+ // (FIXME: why do we care?)
+ if (AllSameOpcodeLeft && ILeft &&
+ cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
+ return false;
return true;
}
- return false;
}
- // One opcode, put the instruction on the right.
- return ILeft != nullptr;
+ // Symmetrically handle Left side.
+ if (AllSameOpcodeLeft) {
+ unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
+ if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
+ return false;
+ if (IRight && LeftPrevOpcode == IRight->getOpcode())
+ return true;
+ }
+ return false;
}
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
- SmallVector<Value *, 16> OrigLeft, OrigRight;
-
if (VL.size()) {
// Peel the first iteration out of the loop since there's nothing
- // interesting to do anyway and it simplifies the checks
+ // interesting to do anyway and it simplifies the checks in the loop.
auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
auto VRight = cast<Instruction>(VL[0])->getOperand(1);
- OrigLeft.push_back(VLeft);
- OrigRight.push_back(VRight);
if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
// Favor having instruction to the right. FIXME: why?
std::swap(VLeft, VRight);
// Keep track if we have instructions with all the same opcode on one side.
bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
+ // Keep track if we have one side with all the same value (broadcast).
+ bool SplatLeft = true;
+ bool SplatRight = true;
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
Instruction *I = cast<Instruction>(VL[i]);
-
- Value *VLeft = I->getOperand(0);
- Value *VRight = I->getOperand(1);
- OrigLeft.push_back(VLeft);
- OrigRight.push_back(VRight);
- Instruction *ILeft = dyn_cast<Instruction>(VLeft);
- Instruction *IRight = dyn_cast<Instruction>(VRight);
-
- // Check whether all operands on one side have the same opcode. In this case
- // we want to preserve the original order and not make things worse by
- // reordering.
- if (AllSameOpcodeLeft && ILeft) {
- if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
- if (PLeft->getOpcode() != ILeft->getOpcode())
- AllSameOpcodeLeft = false;
- } else
- AllSameOpcodeLeft = false;
- }
- if (AllSameOpcodeRight && IRight) {
- if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
- if (PRight->getOpcode() != IRight->getOpcode())
- AllSameOpcodeRight = false;
- } else
- AllSameOpcodeRight = false;
- }
-
-
+ assert(I->isCommutative() && "Can only process commutative instruction");
// Commute to favor either a splat or maximizing having the same opcodes on
// one side.
- if (shouldReorderOperands(i, *I, Left, Right)) {
+ if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
+ AllSameOpcodeRight, SplatLeft, SplatRight)) {
Left.push_back(I->getOperand(1));
Right.push_back(I->getOperand(0));
} else {
Left.push_back(I->getOperand(0));
Right.push_back(I->getOperand(1));
}
+ // Update Splat* and AllSameOpcode* after the insertion.
+ SplatRight = SplatRight && (Right[i - 1] == Right[i]);
+ SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
+ AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
+ (cast<Instruction>(Left[i - 1])->getOpcode() ==
+ cast<Instruction>(Left[i])->getOpcode());
+ AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
+ (cast<Instruction>(Right[i - 1])->getOpcode() ==
+ cast<Instruction>(Right[i])->getOpcode());
}
- bool LeftBroadcast = isSplat(Left);
- bool RightBroadcast = isSplat(Right);
-
- // If operands end up being broadcast return this operand order.
- if (LeftBroadcast || RightBroadcast)
+ // If one operand ends up being broadcast, return this operand order.
+ if (SplatRight || SplatLeft)
return;
- // Don't reorder if the operands where good to begin.
- if (AllSameOpcodeRight || AllSameOpcodeLeft) {
- Left = OrigLeft;
- Right = OrigRight;
- }
-
const DataLayout &DL = F->getParent()->getDataLayout();
// Finally check if we can get longer vectorizable chain by reordering
--- /dev/null
+; RUN: opt -slp-vectorizer < %s -S | FileCheck %s
+
+; Verify that the SLP vectorizer is able to figure out that commutativity
+; offers the possibility to splat/broadcast %c and thus makes it profitable
+; to vectorize this case.
+
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+@cle = external unnamed_addr global [32 x i8], align 16
+@cle32 = external unnamed_addr global [32 x i32], align 16
+
+
+; Check that we correctly detect a splat/broadcast by leveraging the
+; commutativity property of `xor`.
+
+; CHECK-LABEL: @splat
+; CHECK: store <16 x i8>
+define void @splat(i8 %a, i8 %b, i8 %c) {
+ %1 = xor i8 %c, %a
+ store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
+ %2 = xor i8 %a, %c
+ store i8 %2, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
+ %3 = xor i8 %a, %c
+ store i8 %3, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
+ %4 = xor i8 %a, %c
+ store i8 %4, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
+ %5 = xor i8 %c, %a
+ store i8 %5, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
+ %6 = xor i8 %c, %b
+ store i8 %6, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
+ %7 = xor i8 %c, %a
+ store i8 %7, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
+ %8 = xor i8 %c, %b
+ store i8 %8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
+ %9 = xor i8 %a, %c
+ store i8 %9, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
+ %10 = xor i8 %a, %c
+ store i8 %10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
+ %11 = xor i8 %a, %c
+ store i8 %11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
+ %12 = xor i8 %a, %c
+ store i8 %12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
+ %13 = xor i8 %a, %c
+ store i8 %13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
+ %14 = xor i8 %a, %c
+ store i8 %14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
+ %15 = xor i8 %a, %c
+ store i8 %15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
+ %16 = xor i8 %a, %c
+ store i8 %16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
+ ret void
+}
+
+
+
+; Check that we correctly detect that we can have the same opcode on one side by
+; leveraging the commutativity property of `xor`.
+
+; CHECK-LABEL: @same_opcode_on_one_side
+; CHECK: store <4 x i32>
+define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
+ %add1 = add i32 %c, %a
+ %add2 = add i32 %c, %a
+ %add3 = add i32 %a, %c
+ %add4 = add i32 %c, %a
+ %1 = xor i32 %add1, %a
+ store i32 %1, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
+ %2 = xor i32 %b, %add2
+ store i32 %2, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
+ %3 = xor i32 %c, %add3
+ store i32 %3, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
+ %4 = xor i32 %a, %add4
+ store i32 %4, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
+ ret void
+}