X86 CodeGenPrep: sink shufflevectors before shifts

author Tim Northover <tnorthover@apple.com>

Wed, 19 Feb 2014 10:02:43 +0000 (10:02 +0000)

committer Tim Northover <tnorthover@apple.com>

Wed, 19 Feb 2014 10:02:43 +0000 (10:02 +0000)
author Tim Northover <tnorthover@apple.com>
Wed, 19 Feb 2014 10:02:43 +0000 (10:02 +0000)
committer Tim Northover <tnorthover@apple.com>
Wed, 19 Feb 2014 10:02:43 +0000 (10:02 +0000)
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h

index 155bb0a7a58cd28b1a1ed8f8f92582e7c19f1a74..0b3428ed7a6a791f0a83a43e39cfb3fe4df2ded5 100644 (file)
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1185,6 +1185,14 @@ public:
      return true;
    }
  
+  /// Return true if it's significantly cheaper to shift a vector by a uniform
+  /// scalar than by an amount which will vary across each lane. On x86, for
+  /// example, there is a "psllw" instruction for the former case, but no simple
+  /// instruction for a general "a << b" operation on vectors.
+  virtual bool isVectorShiftByScalarCheap(Type *Ty) const {
+    return false;
+  }
+
    /// Return true if it's free to truncate a value of type Ty1 to type
    /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
    /// by referencing its sub-register AX.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index b1d734e932b5159fcaf657319eb8db6f3a7e5655..f038580c5f986595409a7876f3b1d21ed13c2d53 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -14172,6 +14172,24 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
    return true;
  }
  
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+  unsigned Bits = Ty->getScalarSizeInBits();
+
+  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
+  // particularly cheaper than those without.
+  if (Bits == 8)
+    return false;
+
+  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
+  // variable shifts just as cheap as scalar ones.
+  if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
+    return false;
+
+  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
+  // fully general vector.
+  return true;
+}
+
  bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
    if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
      return false;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h

index 6ea060ba3bc1c2339860f385eb80b56be18b72dd..ce9594ae3ed657c6d9d79fb501b10a876f6896e5 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -679,6 +679,9 @@ namespace llvm {
      /// the immediate into a register.
      virtual bool isLegalAddImmediate(int64_t Imm) const;
  
+
+    virtual bool isVectorShiftByScalarCheap(Type *Ty) const;
+
      /// isTruncateFree - Return true if it's free to truncate a value of
      /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
      /// register EAX to i16 by referencing its sub-register AX.
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp

index 0fde256943d12bdc88d05099588d385fe4641952..3c9ecce8e3e2eb14f7fdb7e63f9d3ccd5bc69603 100644 (file)
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -132,6 +132,7 @@ typedef DenseMap<Instruction *, Type *> InstrToOrigTy;
      bool MoveExtToFormExtLoad(Instruction *I);
      bool OptimizeExtUses(Instruction *I);
      bool OptimizeSelectInst(SelectInst *SI);
+    bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI);
      bool DupRetToEnableTailCallOpts(BasicBlock *BB);
      bool PlaceDbgValues(Function &F);
    };
@@ -2719,6 +2720,74 @@ bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
    return true;
  }
  
+
+bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
+  SmallVector<int, 16> Mask(SVI->getShuffleMask());
+  int SplatElem = -1;
+  for (unsigned i = 0; i < Mask.size(); ++i) {
+    if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem)
+      return false;
+    SplatElem = Mask[i];
+  }
+
+  return true;
+}
+
+/// Some targets have expensive vector shifts if the lanes aren't all the same
+/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
+/// it's often worth sinking a shufflevector splat down to its use so that
+/// codegen can spot all lanes are identical.
+bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
+  BasicBlock *DefBB = SVI->getParent();
+
+  // Only do this xform if variable vector shifts are particularly expensive.
+  if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
+    return false;
+
+  // We only expect better codegen by sinking a shuffle if we can recognise a
+  // constant splat.
+  if (!isBroadcastShuffle(SVI))
+    return false;
+
+  // InsertedShuffles - Only insert a shuffle in each block once.
+  DenseMap<BasicBlock*, Instruction*> InsertedShuffles;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = SVI->use_begin(), E = SVI->use_end();
+       UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this ext is used in.
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+
+    // For now only apply this when the splat is used by a shift instruction.
+    if (!User->isShift()) continue;
+
+    // Everything checks out, sink the shuffle if the user's block doesn't
+    // already have a copy.
+    Instruction *&InsertedShuffle = InsertedShuffles[UserBB];
+
+    if (!InsertedShuffle) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+      InsertedShuffle = new ShuffleVectorInst(SVI->getOperand(0),
+                                              SVI->getOperand(1),
+                                              SVI->getOperand(2), "", InsertPt);
+    }
+
+    User->replaceUsesOfWith(SVI, InsertedShuffle);
+    MadeChange = true;
+  }
+
+  // If we removed all uses, nuke the shuffle.
+  if (SVI->use_empty()) {
+    SVI->eraseFromParent();
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
  bool CodeGenPrepare::OptimizeInst(Instruction *I) {
    if (PHINode *P = dyn_cast<PHINode>(I)) {
      // It is possible for very late stage optimizations (such as SimplifyCFG)
@@ -2791,6 +2860,9 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
    if (SelectInst *SI = dyn_cast<SelectInst>(I))
      return OptimizeSelectInst(SI);
  
+  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
+    return OptimizeShuffleVectorInst(SVI);
+
    return false;
  }
  
diff --git a/test/Transforms/CodeGenPrepare/x86-shuffle-sink.ll b/test/Transforms/CodeGenPrepare/x86-shuffle-sink.ll

new file mode 100644 (file)

index 0000000..c4ee79c
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/x86-shuffle-sink.ll
@@ -0,0 +1,102 @@
+; RUN: opt -S -codegenprepare -mtriple=x86_64-apple-macosx10.9 -mcpu=core-avx2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX2
+; RUN: opt -S -codegenprepare -mtriple=x86_64-apple-macosx10.9 -mcpu=corei7 %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SSE2
+
+define <16 x i8> @test_8bit(<16 x i8> %lhs, <16 x i8> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_8bit
+; CHECK: if_true:
+; CHECK-NOT: shufflevector
+
+; CHECK: if_false:
+; CHECK-NOT: shufflevector
+; CHECK: shl <16 x i8> %lhs, %mask
+  %mask = shufflevector <16 x i8> %tmp, <16 x i8> undef, <16 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <16 x i8> %mask
+
+if_false:
+  %res = shl <16 x i8> %lhs, %mask
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_16bit(<8 x i16> %lhs, <8 x i16> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_16bit
+; CHECK: if_true:
+; CHECK-NOT: shufflevector
+
+; CHECK: if_false:
+; CHECK: [[SPLAT:%[0-9a-zA-Z_]+]] = shufflevector
+; CHECK: shl <8 x i16> %lhs, [[SPLAT]]
+  %mask = shufflevector <8 x i16> %tmp, <8 x i16> undef, <8 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <8 x i16> %mask
+
+if_false:
+  %res = shl <8 x i16> %lhs, %mask
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_notsplat(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_notsplat
+; CHECK: if_true:
+; CHECK-NOT: shufflevector
+
+; CHECK: if_false:
+; CHECK-NOT: shufflevector
+; CHECK: shl <4 x i32> %lhs, %mask
+  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <4 x i32> %mask
+
+if_false:
+  %res = shl <4 x i32> %lhs, %mask
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_32bit(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
+; CHECK-AVX2-LABEL: @test_32bit
+; CHECK-AVX2: if_false:
+; CHECK-AVX2-NOT: shufflevector
+; CHECK-AVX2: ashr <4 x i32> %lhs, %mask
+
+; CHECK-SSE2-LABEL: @test_32bit
+; CHECK-SSE2: if_false:
+; CHECK-SSE2: [[SPLAT:%[0-9a-zA-Z_]+]] = shufflevector
+; CHECK-SSE2: ashr <4 x i32> %lhs, [[SPLAT]]
+  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <4 x i32> %mask
+
+if_false:
+  %res = ashr <4 x i32> %lhs, %mask
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_64bit(<2 x i64> %lhs, <2 x i64> %tmp, i1 %tst) {
+; CHECK-AVX2-LABEL: @test_64bit
+; CHECK-AVX2: if_false:
+; CHECK-AVX2-NOT: shufflevector
+; CHECK-AVX2: lshr <2 x i64> %lhs, %mask
+
+; CHECK-SSE2-LABEL: @test_64bit
+; CHECK-SSE2: if_false:
+; CHECK-SSE2: [[SPLAT:%[0-9a-zA-Z_]+]] = shufflevector
+; CHECK-SSE2: lshr <2 x i64> %lhs, [[SPLAT]]
+
+  %mask = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <2 x i64> %mask
+
+if_false:
+  %res = lshr <2 x i64> %lhs, %mask
+  ret <2 x i64> %res
+}
author	Tim Northover <tnorthover@apple.com>
	Wed, 19 Feb 2014 10:02:43 +0000 (10:02 +0000)
committer	Tim Northover <tnorthover@apple.com>
	Wed, 19 Feb 2014 10:02:43 +0000 (10:02 +0000)
include/llvm/Target/TargetLowering.h		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.h		patch \| blob \| history
lib/Transforms/Scalar/CodeGenPrepare.cpp		patch \| blob \| history
test/Transforms/CodeGenPrepare/x86-shuffle-sink.ll	[new file with mode: 0644]	patch \| blob