[LV] Don't bail to MiddleBlock if a runtime check fails, bail to ScalarPH instead

author James Molloy <james.molloy@arm.com>

Wed, 2 Sep 2015 10:15:39 +0000 (10:15 +0000)

committer James Molloy <james.molloy@arm.com>

Wed, 2 Sep 2015 10:15:39 +0000 (10:15 +0000)
author James Molloy <james.molloy@arm.com>
Wed, 2 Sep 2015 10:15:39 +0000 (10:15 +0000)
committer James Molloy <james.molloy@arm.com>
Wed, 2 Sep 2015 10:15:39 +0000 (10:15 +0000)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 0975b03da173217ee7ac4a417c3d9df60aa67a5a..dd9edf442a58c59102cedfc0414dcef7743554ce 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2785,13 +2785,13 @@ void InnerLoopVectorizer::createEmptyLoop() {
    |  /  |
    | /   v
    ||   [ ]     <-- vector pre header.
-  ||    |
-  ||    v
-  ||   [  ] \
-  ||   [  ]_|   <-- vector loop.
-  ||    |
-  | \   v
-  |   >[ ]   <--- middle-block.
+  |/    |
+  |     v
+  |    [  ] \
+  |    [  ]_|   <-- vector loop.
+  |     |
+  |     v
+  |   -[ ]   <--- middle-block.
    |  /  |
    | /   v
    -|- >[ ]     <--- new preheader.
@@ -2860,16 +2860,16 @@ void InnerLoopVectorizer::createEmptyLoop() {
    emitMinimumIterationCountCheck(Lp, ScalarPH);
    // Now, compare the new count to zero. If it is zero skip the vector loop and
    // jump to the scalar loop.
-  emitVectorLoopEnteredCheck(Lp, MiddleBlock);
+  emitVectorLoopEnteredCheck(Lp, ScalarPH);
    // Generate the code to check that the strides we assumed to be one are really
    // one. We want the new basic block to start at the first instruction in a
    // sequence of instructions that form a check.
-  emitStrideChecks(Lp, MiddleBlock);
+  emitStrideChecks(Lp, ScalarPH);
    // Generate the code that checks in runtime if arrays overlap. We put the
    // checks into a separate block to make the more common case of few elements
    // faster.
-  emitMemRuntimeChecks(Lp, MiddleBlock);
-
+  emitMemRuntimeChecks(Lp, ScalarPH);
+  
    // Generate the induction variable.
    // The loop step is equal to the vectorization factor (num of SIMD elements)
    // times the unroll factor (num of SIMD instructions).
@@ -2890,27 +2890,20 @@ void InnerLoopVectorizer::createEmptyLoop() {
    // This variable saves the new starting index for the scalar loop. It is used
    // to test if there are any tail iterations left once the vector loop has
    // completed.
-  PHINode *ResumeIndex = nullptr;
    LoopVectorizationLegality::InductionList::iterator I, E;
    LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
    for (I = List->begin(), E = List->end(); I != E; ++I) {
      PHINode *OrigPhi = I->first;
      InductionDescriptor II = I->second;
  
-    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
-                                         MiddleBlock->getTerminator());
      // Create phi nodes to merge from the  backedge-taken check block.
      PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3,
                                             "bc.resume.val",
                                             ScalarPH->getTerminator());
-    BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
-
      Value *EndValue;
      if (OrigPhi == OldInduction) {
        // We know what the end value is.
        EndValue = CountRoundDown;
-      // We also know which PHI node holds it.
-      ResumeIndex = ResumeVal;
      } else {
        IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
        Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
@@ -2922,41 +2915,23 @@ void InnerLoopVectorizer::createEmptyLoop() {
  
      // The new PHI merges the original incoming value, in case of a bypass,
      // or the value at the end of the vectorized loop.
-    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-      ResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
-    ResumeVal->addIncoming(EndValue, VecBody);
+    BCResumeVal->addIncoming(EndValue, MiddleBlock);
  
      // Fix the scalar body counter (PHI node).
      unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
  
      // The old induction's phi node in the scalar body needs the truncated
      // value.
-    BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[0]);
+    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
      OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
    }
  
-  // If we are generating a new induction variable then we also need to
-  // generate the code that calculates the exit value. This value is not
-  // simply the end of the counter because we may skip the vectorized body
-  // in case of a runtime check.
-  if (!OldInduction){
-    assert(!ResumeIndex && "Unexpected resume value found");
-    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
-                                  MiddleBlock->getTerminator());
-    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
-    ResumeIndex->addIncoming(CountRoundDown, VecBody);
-  }
-
-  // Make sure that we found the index where scalar loop needs to continue.
-  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
-         "Invalid resume Index");
-
    // Add a check in the middle block to see if we have completed
    // all of the iterations in the first vector loop.
    // If (N - N%VF) == N, then we *don't* need to run the remainder.
    Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                                ResumeIndex, "cmp.n",
+                                CountRoundDown, "cmp.n",
                                  MiddleBlock->getTerminator());
    ReplaceInstWithInst(MiddleBlock->getTerminator(),
                        BranchInst::Create(ExitBlock, ScalarPH, CmpN));
@@ -3262,19 +3237,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
      // instructions.
      Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
  
-    VectorParts RdxParts, &RdxExitVal = getVectorValue(LoopExitInst);
+    VectorParts RdxParts = getVectorValue(LoopExitInst);
      setDebugLocFromInst(Builder, LoopExitInst);
-    for (unsigned part = 0; part < UF; ++part) {
-      // This PHINode contains the vectorized reduction variable, or
-      // the initial value vector, if we bypass the vector loop.
-      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
-      Value *StartVal = (part == 0) ? VectorStart : Identity;
-      for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
-      NewPhi->addIncoming(RdxExitVal[part],
-                          LoopVectorBody.back());
-      RdxParts.push_back(NewPhi);
-    }
  
      // If the vector reduction can be performed in a smaller type, we truncate
      // then extend the loop exit value to enable InstCombine to evaluate the
@@ -3283,15 +3247,17 @@ void InnerLoopVectorizer::vectorizeLoop() {
        Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
        Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator());
        for (unsigned part = 0; part < UF; ++part) {
-        Value *Trunc = Builder.CreateTrunc(RdxExitVal[part], RdxVecTy);
+        Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
          Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                            : Builder.CreateZExt(Trunc, VecTy);
-        for (Value::user_iterator UI = RdxExitVal[part]->user_begin();
-             UI != RdxExitVal[part]->user_end();)
-          if (*UI != Trunc)
-            (*UI++)->replaceUsesOfWith(RdxExitVal[part], Extnd);
-          else
+        for (Value::user_iterator UI = RdxParts[part]->user_begin();
+             UI != RdxParts[part]->user_end();)
+          if (*UI != Trunc) {
+            (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
+            RdxParts[part] = Extnd;
+          } else {
              ++UI;
+          }
        }
        Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
        for (unsigned part = 0; part < UF; ++part)
@@ -3362,7 +3328,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
      // block and the middle block.
      PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
                                            LoopScalarPreHeader->getTerminator());
-    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]);
+    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
      BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
  
      // Now, we need to fix the users of the reduction variable
@@ -3850,7 +3817,7 @@ void InnerLoopVectorizer::updateAnalysis() {
      }
    }
  
-  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
+  DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
    DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
    DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
    DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
diff --git a/test/Transforms/LoopVectorize/debugloc.ll b/test/Transforms/LoopVectorize/debugloc.ll

index d470a7a0846c8b9d0b0c45fb1eb652b3a428ee59..c5277c36b3f9bbbd1d67d0f6befb76ee8e9fa70b 100644 (file)
--- a/test/Transforms/LoopVectorize/debugloc.ll
+++ b/test/Transforms/LoopVectorize/debugloc.ll
@@ -14,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
  ; CHECK:   add i64 %index, 2, !dbg ![[LOC]]
  ; CHECK:   icmp eq i64 %index.next, %n.vec, !dbg ![[LOC]]
  ; CHECK: middle.block
-; CHECK:   add <2 x i32> %rdx.vec.exit.phi, %rdx.shuf, !dbg ![[LOC2]]
+; CHECK:   add <2 x i32> %{{.*}}, %rdx.shuf, !dbg ![[LOC2]]
  ; CHECK:   extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[LOC2]]
  
  define i32 @f(i32* nocapture %a, i32 %size) #0 {
diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll

index f6072add3d1cd360d94c4748586419dd5a0ab2fa..59ee66a4a35df236bfdb765f119516482dc229dd 100644 (file)
--- a/test/Transforms/LoopVectorize/induction.ll
+++ b/test/Transforms/LoopVectorize/induction.ll
@@ -115,8 +115,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
  ; CHECK:  br i1 true, label %scalar.ph, label %min.iters.checked
  
  ; CHECK: scalar.ph:
-; CHECK:  %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %0 ]
-; CHECK:  %bc.merge.rdx = phi i32 [ 1, %0 ], [ %5, %middle.block ]
+; CHECK:  %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %0 ]
+; CHECK:  %bc.merge.rdx = phi i32 [ 1, %0 ], [ 1, %min.iters.checked ], [ %5, %middle.block ]
  
  define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
  
diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll

index 647e58a7e41f5b34bba05a00e85ba0e9e00bd54e..63b138f1d56010bf16cc0fd2d0bb8bbe76cc44fe 100644 (file)
--- a/test/Transforms/LoopVectorize/reduction.ll
+++ b/test/Transforms/LoopVectorize/reduction.ll
@@ -175,8 +175,8 @@ for.end:                                          ; preds = %for.body, %entry
  }
  
  ;CHECK-LABEL: @reduction_and(
-;CHECK: and <4 x i32>
  ;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
+;CHECK: and <4 x i32>
  ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ;CHECK: and <4 x i32>
  ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll

index 9066fb5ff0dc202285db2e1cf12bc49b349ac9b0..3673b71db30d5bc5879950feb39a5385ddc994e2 100644 (file)
--- a/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@@ -11,9 +11,9 @@ target triple = "x86_64-apple-macosx10.9.0"
  
  ;CHECK-LABEL: define i32 @foo
  ;CHECK: for.body.preheader:
-;CHECK: br i1 %cmp.zero, label %middle.block, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]]
+;CHECK: br i1 %cmp.zero, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]]
  ;CHECK: vector.memcheck:
-;CHECK: br i1 %memcheck.conflict, label %middle.block, label %vector.ph, !dbg [[BODY_LOC]]
+;CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph, !dbg [[BODY_LOC]]
  ;CHECK: load <4 x float>
  define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
  entry:
author	James Molloy <james.molloy@arm.com>
	Wed, 2 Sep 2015 10:15:39 +0000 (10:15 +0000)
committer	James Molloy <james.molloy@arm.com>
	Wed, 2 Sep 2015 10:15:39 +0000 (10:15 +0000)
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/debugloc.ll		patch \| blob \| history
test/Transforms/LoopVectorize/induction.ll		patch \| blob \| history
test/Transforms/LoopVectorize/reduction.ll		patch \| blob \| history
test/Transforms/LoopVectorize/runtime-check.ll		patch \| blob \| history