[SLPVectorizer] Try different vectorization factors for store chains

author Sanjay Patel <spatel@rotateright.com>

Wed, 8 Jul 2015 23:40:55 +0000 (23:40 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Wed, 8 Jul 2015 23:40:55 +0000 (23:40 +0000)
author Sanjay Patel <spatel@rotateright.com>
Wed, 8 Jul 2015 23:40:55 +0000 (23:40 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Wed, 8 Jul 2015 23:40:55 +0000 (23:40 +0000)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp

index da35b35248f646747bca14a7ad84e9c9be0b6d9f..7bac407e77e978407d1d91056f62f0c9e7ef2e38 100644 (file)
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -69,8 +69,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
      cl::desc(
          "Attempt to vectorize horizontal reductions feeding into a store"));
  
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+    cl::desc("Attempt to vectorize for this register size in bits"));
+
  namespace {
  
+// FIXME: Set this via cl::opt to allow overriding.
  static const unsigned MinVecRegSize = 128;
  
  static const unsigned RecursionMaxDepth = 12;
@@ -3088,6 +3093,17 @@ struct SLPVectorizer : public FunctionPass {
      if (!TTI->getNumberOfRegisters(true))
        return false;
  
+    // Use the vector register size specified by the target unless overridden
+    // by a command-line option.
+    // TODO: It would be better to limit the vectorization factor based on
+    //       data type rather than just register size. For example, x86 AVX has
+    //       256-bit registers, but it does not support integer operations
+    //       at that width (that requires AVX2).
+    if (MaxVectorRegSizeOption.getNumOccurrences())
+      MaxVecRegSize = MaxVectorRegSizeOption;
+    else
+      MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
      // Don't vectorize when the attribute NoImplicitFloat is used.
      if (F.hasFnAttribute(Attribute::NoImplicitFloat))
        return false;
@@ -3165,12 +3181,13 @@ private:
    bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
  
    bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
-                           BoUpSLP &R);
+                           BoUpSLP &R, unsigned VecRegSize);
  
    bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
                         BoUpSLP &R);
  private:
    StoreListMap StoreRefs;
+  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  };
  
  /// \brief Check that the Values in the slice in VL array are still existent in
@@ -3185,14 +3202,15 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
  }
  
  bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
-                                          int CostThreshold, BoUpSLP &R) {
+                                        int CostThreshold, BoUpSLP &R,
+                                        unsigned VecRegSize) {
    unsigned ChainLen = Chain.size();
    DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
          << "\n");
    Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
    auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
    unsigned Sz = DL.getTypeSizeInBits(StoreTy);
-  unsigned VF = MinVecRegSize / Sz;
+  unsigned VF = VecRegSize / Sz;
  
    if (!isPowerOf2_32(Sz) || VF < 2)
      return false;
@@ -3276,10 +3294,15 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
        I = ConsecutiveChain[I];
      }
  
-    if (vectorizeStoreChain(Operands, costThreshold, R)) {
-      // Mark the vectorized stores so that we don't vectorize them again.
-      VectorizedStores.insert(Operands.begin(), Operands.end());
-      Changed = true;
+    // FIXME: Is division-by-2 the correct step? Should we assert that the
+    // register size is a power-of-2?
+    for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+      if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+        // Mark the vectorized stores so that we don't vectorize them again.
+        VectorizedStores.insert(Operands.begin(), Operands.end());
+        Changed = true;
+        break;
+      }
      }
    }
  
@@ -3340,6 +3363,8 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
  
    Type *Ty0 = I0->getType();
    unsigned Sz = DL.getTypeSizeInBits(Ty0);
+  // FIXME: Register size should be a parameter to this function, so we can
+  // try different vectorization factors.
    unsigned VF = MinVecRegSize / Sz;
  
    for (Value *V : VL) {
@@ -3569,6 +3594,8 @@ public:
      const DataLayout &DL = B->getModule()->getDataLayout();
      ReductionOpcode = B->getOpcode();
      ReducedValueOpcode = 0;
+    // FIXME: Register size should be a parameter to this function, so we can
+    // try different vectorization factors.
      ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
      ReductionRoot = B;
      ReductionPHI = Phi;
@@ -3995,6 +4022,9 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
            << it->second.size() << ".\n");
  
      // Process the stores in chunks of 16.
+    // TODO: The limit of 16 inhibits greater vectorization factors.
+    //       For example, AVX2 supports v32i8. Increasing this limit, however,
+    //       may cause a significant compile-time increase.
      for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
        unsigned Len = std::min<unsigned>(CE - CI, 16);
        Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
diff --git a/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll b/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll

index 9ed86f88147399d42a18aadb8963f3a34dfe030c..35763953911b5faf3bd5c22eec8d83833e563fd9 100644 (file)
--- a/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
+++ b/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
@@ -1,4 +1,9 @@
  ; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s
+; XFAIL: *
+; 
+; FIXME: If this test expects to be vectorized, the TTI must indicate that the target
+;        has vector registers of the expected width.
+;        Currently, it says there are 8 vector registers that are 32-bits wide.
  
  target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
  
diff --git a/test/Transforms/SLPVectorizer/X86/cse.ll b/test/Transforms/SLPVectorizer/X86/cse.ll

index 00e1ecd4ad327b97c1e63c7c39eb45296e859497..8d25b3661dc3785e8871890ba05f8fef7b36449b 100644 (file)
--- a/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -12,11 +12,8 @@ target triple = "i386-apple-macosx10.8.0"
  
  ;CHECK-LABEL: @test(
  ;CHECK: load <2 x double>
-;CHECK: fadd <2 x double>
-;CHECK: store <2 x double>
-;CHECK: insertelement <2 x double>
-;CHECK: fadd <2 x double>
-;CHECK: store <2 x double>
+;CHECK: fadd <4 x double>
+;CHECK: store <4 x double>
  ;CHECK: ret i32
  
  define i32 @test(double* nocapture %G) {
@@ -48,11 +45,12 @@ entry:
  ;  A[2] = A[2] * 7.6 * n + 3.0;
  ;  A[3] = A[3] * 7.4 * n + 4.0;
  ;}
-;CHECK-LABEL: @foo(
-;CHECK: insertelement <2 x double>
-;CHECK: insertelement <2 x double>
-;CHECK-NOT: insertelement <2 x double>
-;CHECK: ret
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x double>
+; CHECK: fmul <4 x double>
+; CHECK: fmul <4 x double>
+; CHECK: fadd <4 x double>
+; CHECK: store <4 x double>
  define i32 @foo(double* nocapture %A, i32 %n) {
  entry:
    %0 = load double, double* %A, align 8
@@ -140,11 +138,12 @@ define i32 @test2(double* nocapture %G, i32 %k) {
  ;  A[2] = A[2] * 7.9 * n + 6.0;
  ;  A[3] = A[3] * 7.9 * n + 6.0;
  ;}
-;CHECK-LABEL: @foo4(
-;CHECK: insertelement <2 x double>
-;CHECK: insertelement <2 x double>
-;CHECK-NOT: insertelement <2 x double>
-;CHECK: ret
+; CHECK-LABEL: @foo4(
+; CHECK: load <4 x double>
+; CHECK: fmul <4 x double>
+; CHECK: fmul <4 x double>
+; CHECK: fadd <4 x double>
+; CHECK: store <4 x double>
  define i32 @foo4(double* nocapture %A, i32 %n) {
  entry:
    %0 = load double, double* %A, align 8
diff --git a/test/Transforms/SLPVectorizer/X86/gep.ll b/test/Transforms/SLPVectorizer/X86/gep.ll

index 3f952d7b242b2053aeef06e07b40352891a7bca4..d10f2b6015d4aafc6979faeda4b3541d0b5284a2 100644 (file)
--- a/test/Transforms/SLPVectorizer/X86/gep.ll
+++ b/test/Transforms/SLPVectorizer/X86/gep.ll
@@ -1,5 +1,6 @@
  ; RUN: opt < %s -basicaa -slp-vectorizer -S |FileCheck %s
  target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
  
  ; Test if SLP can handle GEP expressions.
  ; The test perform the following action:
diff --git a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll

index bbc315c24e17747cd947fd994da339ceb0d510c6..dace4b35b8711d6248aec8934b5c8fba50a14503 100644 (file)
--- a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
+++ b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
@@ -4,12 +4,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
  target triple = "x86_64-apple-macosx10.8.0"
  
  ;CHECK-LABEL: @foo(
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
+;CHECK: load <8 x i32>
+;CHECK: add nsw <8 x i32>
+;CHECK: store <8 x i32>
  ;CHECK: ret
  define i32 @foo(i32* nocapture %A, i32 %n) {
  entry:
diff --git a/test/Transforms/SLPVectorizer/X86/pr19657.ll b/test/Transforms/SLPVectorizer/X86/pr19657.ll

index 0e404b2ad7bea224dff8c6cf9c4f6708b8946a2f..32f8da4c7ee03a222e08229e72bacba2a5574e7d 100644 (file)
--- a/test/Transforms/SLPVectorizer/X86/pr19657.ll
+++ b/test/Transforms/SLPVectorizer/X86/pr19657.ll
@@ -1,11 +1,24 @@
  ; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-max-reg-size=128 -S -mcpu=corei7-avx | FileCheck %s --check-prefix=V128
  
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"
  
-; CHECK: load <2 x double>, <2 x double>*
-; CHECK: fadd <2 x double>
-; CHECK: store <2 x double>
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x double>
+; CHECK: fadd <4 x double>
+; CHECK: fadd <4 x double>
+; CHECK: store <4 x double>
+
+; V128-LABEL: @foo(
+; V128: load <2 x double>
+; V128: fadd <2 x double>
+; V128: fadd <2 x double>
+; V128: store <2 x double>
+; V128: load <2 x double>
+; V128: fadd <2 x double>
+; V128: fadd <2 x double>
+; V128: store <2 x double>
  
  define void @foo(double* %x) {
    %1 = load double, double* %x, align 8
author	Sanjay Patel <spatel@rotateright.com>
	Wed, 8 Jul 2015 23:40:55 +0000 (23:40 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Wed, 8 Jul 2015 23:40:55 +0000 (23:40 +0000)
lib/Transforms/Vectorize/SLPVectorizer.cpp		patch \| blob \| history
test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll		patch \| blob \| history
test/Transforms/SLPVectorizer/X86/cse.ll		patch \| blob \| history
test/Transforms/SLPVectorizer/X86/gep.ll		patch \| blob \| history
test/Transforms/SLPVectorizer/X86/loopinvariant.ll		patch \| blob \| history
test/Transforms/SLPVectorizer/X86/pr19657.ll		patch \| blob \| history