From a9baf1ecfd343e2d8d8fa277c8b093a1869726bb Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer
Date: Sat, 21 Sep 2013 01:06:00 +0000
Subject: [PATCH] Reapply "SLPVectorizer: Handle more horizontal reductions (disabled)"

Reapply r191108 with a fix for a memory corruption error I introduced. Of
course, we can't reference the scalars that we replace by vectorizing and then
call their eraseFromParent method. I only 'needed' the scalars to get the
DebugLoc. Just store the DebugLoc before actually vectorizing instead. As a
nice side effect, this also simplifies the interface between BoUpSLP and the
HorizontalReduction class to returning a value pointer (the vectorized tree
root).

radar://14607682

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@191123 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp    | 372 +++++++++++++++-
 .../SLPVectorizer/X86/horizontal.ll           | 415 ++++++++++++++++++
 2 files changed, 779 insertions(+), 8 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/horizontal.ll

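Note that the new transform is gated off by default (the flag added below is
cl::init(false)), so this patch changes nothing unless horizontal-reduction
vectorization is explicitly requested. A typical invocation to exercise it,
mirroring the RUN line of the new test (the input file name here is just a
placeholder):

    opt -slp-vectorizer -slp-vectorize-hor -S input.ll -mtriple=x86_64-apple-macosx -mcpu=corei7-avx

The -mtriple/-mcpu pair only matters in that the target cost model has to
report the reduction as profitable.
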
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cd3f723cd3e..053e08e4975 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -49,6 +49,11 @@ static cl::opt<int>
     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                      cl::desc("Only vectorize if you gain more than this "
                               "number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
+                   cl::desc("Attempt to vectorize horizontal reductions"));
+
 namespace {
 
 static const unsigned MinVecRegSize = 128;
@@ -238,17 +243,20 @@ public:
   }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
-  void vectorizeTree();
+  /// Returns the vectorized root.
+  Value *vectorizeTree();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
   int getTreeCost();
 
-  /// Construct a vectorizable tree that starts at \p Roots.
-  void buildTree(ArrayRef<Value *> Roots);
+  /// Construct a vectorizable tree that starts at \p Roots and is possibly
+  /// used by a reduction of \p RdxOps.
+  void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);
 
   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
+    RdxOps = 0;
     VectorizableTree.clear();
     ScalarToTreeEntry.clear();
     MustGather.clear();
@@ -401,6 +409,9 @@ private:
   /// Numbers instructions in different blocks.
   DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
 
+  /// Reduction operators.
+  ValueSet *RdxOps;
+
   // Analysis and block reference.
   Function *F;
   ScalarEvolution *SE;
@@ -413,8 +424,9 @@ private:
   IRBuilder<> Builder;
 };
 
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
   deleteTree();
+  RdxOps = Rdx;
   if (!getSameType(Roots))
     return;
   buildTree_rec(Roots, 0);
@@ -445,8 +457,12 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
         assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
         continue;
       }
+      Instruction *UserInst = dyn_cast<Instruction>(*User);
+      if (!UserInst)
+        continue;
 
-      if (!isa<Instruction>(*User))
+      // Ignore uses that are part of the reduction.
+      if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
         continue;
 
       DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " <<
@@ -578,6 +594,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         continue;
       }
 
+      // This user is part of the reduction.
+      if (RdxOps && RdxOps->count(User))
+        continue;
+
       // Make sure that we can schedule this unknown user.
       BlockNumbering &BN = BlocksNumbers[BB];
       int UserIndex = BN.getIndex(User);
@@ -1372,7 +1392,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   return 0;
 }
 
-void BoUpSLP::vectorizeTree() {
+Value *BoUpSLP::vectorizeTree() {
   Builder.SetInsertPoint(F->getEntryBlock().begin());
   vectorizeTree(&VectorizableTree[0]);
 
@@ -1449,7 +1469,10 @@ void BoUpSLP::vectorizeTree() {
         DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
         assert(!MustGather.count(*User) &&
                "Replacing gathered value with undef");
-        assert(ScalarToTreeEntry.count(*User) &&
+
+        assert((ScalarToTreeEntry.count(*User) ||
+                // It is legal to replace the reduction users by undef.
+                (RdxOps && RdxOps->count(*User))) &&
                "Replacing out-of-tree value with undef");
       }
       Value *Undef = UndefValue::get(Ty);
@@ -1464,6 +1487,8 @@ void BoUpSLP::vectorizeTree() {
     BlocksNumbers[it].forget();
   }
   Builder.ClearInsertionPoint();
+
+  return VectorizableTree[0].VectorizedValue;
 }
 
 void BoUpSLP::optimizeGatherSequence() {
@@ -1887,6 +1912,308 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
   return 0;
 }
 
+/// \brief Generate a shuffle mask to be used in a reduction tree.
+///
+/// \param VecLen The length of the vector to be reduced.
+/// \param NumEltsToRdx The number of elements that should be reduced in the
+///        vector.
+/// \param IsPairwise Whether the reduction is a pairwise or splitting
+///        reduction. A pairwise reduction will generate a mask of
+///        <0,2,...> or <1,3,..> while a splitting reduction will generate
+///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
+/// \param IsLeft True will generate a mask of even elements, odd otherwise.
+static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
+                                   bool IsPairwise, bool IsLeft,
+                                   IRBuilder<> &Builder) {
+  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
+
+  SmallVector<Constant *, 32> ShuffleMask(
+      VecLen, UndefValue::get(Builder.getInt32Ty()));
+
+  if (IsPairwise)
+    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
+    for (unsigned i = 0; i != NumEltsToRdx; ++i)
+      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+  else
+    // Move the upper half of the vector to the lower half.
+    for (unsigned i = 0; i != NumEltsToRdx; ++i)
+      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+
+  return ConstantVector::get(ShuffleMask);
+}
+
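For reference, the masks this helper produces are easy to tabulate. The
following standalone sketch (not part of the patch; no LLVM dependencies, with
-1 standing in for undef) mirrors the index arithmetic above:

    // Mirrors createRdxShuffleMask: pairwise masks pick even (left) or odd
    // (right) lanes; splitting masks move the upper NumEltsToRdx lanes down.
    #include <cstdio>
    #include <vector>

    static std::vector<int> rdxMask(unsigned VecLen, unsigned NumEltsToRdx,
                                    bool IsPairwise, bool IsLeft) {
      std::vector<int> Mask(VecLen, -1); // -1 stands in for 'undef'
      for (unsigned i = 0; i != NumEltsToRdx; ++i)
        Mask[i] = IsPairwise ? int(2 * i + !IsLeft) : int(NumEltsToRdx + i);
      return Mask;
    }

    int main() {
      // VecLen = 4, NumEltsToRdx = 2:
      //   pairwise/left  -> 0 2 -1 -1   i.e. <0,2,undef,undef>
      //   pairwise/right -> 1 3 -1 -1   i.e. <1,3,undef,undef>
      //   splitting      -> 2 3 -1 -1   i.e. <2,3,undef,undef>
      for (int M : rdxMask(4, 2, true, true))   printf("%d ", M);
      printf("\n");
      for (int M : rdxMask(4, 2, true, false))  printf("%d ", M);
      printf("\n");
      for (int M : rdxMask(4, 2, false, false)) printf("%d ", M);
      printf("\n");
    }
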
+
+/// Model horizontal reductions.
+///
+/// A horizontal reduction is a tree of reduction operations (currently add
+/// and fadd) that has operations that can be put into a vector as its leaves.
+/// For example, this tree:
+///
+///  mul mul mul mul
+///   \  /    \  /
+///    +       +
+///     \     /
+///        +
+/// This tree has "mul" as its reduced values and "+" as its reduction
+/// operations. A reduction might be feeding into a store or a binary operation
+/// feeding a phi.
+///    ...
+///    \  /
+///     +
+///      \
+///   phi +=
+///
+///  Or:
+///    ...
+///    \  /
+///     +
+///      \
+///   *p =
+///
+class HorizontalReduction {
+  SmallPtrSet<Value *, 16> ReductionOps;
+  SmallVector<Value *, 32> ReducedVals;
+
+  BinaryOperator *ReductionRoot;
+  PHINode *ReductionPHI;
+
+  /// The opcode of the reduction.
+  unsigned ReductionOpcode;
+  /// The opcode of the values we perform a reduction on.
+  unsigned ReducedValueOpcode;
+  /// The width of one full horizontal reduction operation.
+  unsigned ReduxWidth;
+  /// Should we model this reduction as a pairwise reduction tree or a tree
+  /// that splits the vector in halves and adds those halves.
+  bool IsPairwiseReduction;
+
+public:
+  HorizontalReduction()
+    : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
+    ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
+
+  /// \brief Try to find a reduction tree.
+  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
+                                 DataLayout *DL) {
+    assert((!Phi ||
+            std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
+           "This phi needs to use the binary operator");
+
+    // We could have an initial reduction that is not an add.
+    //  r *= v1 + v2 + v3 + v4
+    // In such a case start looking for a tree rooted in the first '+'.
+    if (Phi) {
+      if (B->getOperand(0) == Phi) {
+        Phi = 0;
+        B = dyn_cast<BinaryOperator>(B->getOperand(1));
+      } else if (B->getOperand(1) == Phi) {
+        Phi = 0;
+        B = dyn_cast<BinaryOperator>(B->getOperand(0));
+      }
+    }
+
+    if (!B)
+      return false;
+
+    Type *Ty = B->getType();
+    if (Ty->isVectorTy())
+      return false;
+
+    ReductionOpcode = B->getOpcode();
+    ReducedValueOpcode = 0;
+    ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+    ReductionRoot = B;
+    ReductionPHI = Phi;
+
+    if (ReduxWidth < 4)
+      return false;
+
+    // We currently only support adds.
+    if (ReductionOpcode != Instruction::Add &&
+        ReductionOpcode != Instruction::FAdd)
+      return false;
+
+    // Post order traverse the reduction tree starting at B. We only handle
+    // true trees containing only binary operators.
+    SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
+    Stack.push_back(std::make_pair(B, 0));
+    while (!Stack.empty()) {
+      BinaryOperator *TreeN = Stack.back().first;
+      unsigned EdgeToVist = Stack.back().second++;
+      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+
+      // Only handle trees in the current basic block.
+      if (TreeN->getParent() != B->getParent())
+        return false;
+
+      // Each tree node needs to have one user except for the ultimate
+      // reduction.
+      if (!TreeN->hasOneUse() && TreeN != B)
+        return false;
+
+      // Postorder visit.
+      if (EdgeToVist == 2 || IsReducedValue) {
+        if (IsReducedValue) {
+          // Make sure that the opcodes of the operations that we are going to
+          // reduce match.
+          if (!ReducedValueOpcode)
+            ReducedValueOpcode = TreeN->getOpcode();
+          else if (ReducedValueOpcode != TreeN->getOpcode())
+            return false;
+          ReducedVals.push_back(TreeN);
+        } else {
+          // We need to be able to reassociate the adds.
+          if (!TreeN->isAssociative())
+            return false;
+          ReductionOps.insert(TreeN);
+        }
+        // Retract.
+        Stack.pop_back();
+        continue;
+      }
+
+      // Visit left or right.
+      Value *NextV = TreeN->getOperand(EdgeToVist);
+      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
+      if (Next)
+        Stack.push_back(std::make_pair(Next, 0));
+      else if (NextV != Phi)
+        return false;
+    }
+    return true;
+  }
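To make the matched shape concrete, here is a hand-written IR sketch (not taken
from the patch or its tests; value and block names are illustrative) of one
iteration of a four-wide fadd reduction feeding a loop-carried phi. The fmuls
end up in ReducedVals, the fadd chain in ReductionOps; because one operand of
the fadd feeding the phi is the phi itself, the code above re-roots the match
at %r2 and clears ReductionPHI:

    %sum      = phi float [ 0.000000e+00, %ph ], [ %sum.next, %loop ]
    %m0       = fmul fast float %a0, 7.000000e+00
    %m1       = fmul fast float %a1, 7.000000e+00
    %m2       = fmul fast float %a2, 7.000000e+00
    %m3       = fmul fast float %a3, 7.000000e+00
    %r0       = fadd fast float %m0, %m1
    %r1       = fadd fast float %r0, %m2
    %r2       = fadd fast float %r1, %m3
    %sum.next = fadd fast float %sum, %r2
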
+
+  /// \brief Attempt to vectorize the tree found by
+  /// matchAssociativeReduction.
+  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+    if (ReducedVals.empty())
+      return false;
+
+    unsigned NumReducedVals = ReducedVals.size();
+    if (NumReducedVals < ReduxWidth)
+      return false;
+
+    Value *VectorizedTree = 0;
+    IRBuilder<> Builder(ReductionRoot);
+    FastMathFlags Unsafe;
+    Unsafe.setUnsafeAlgebra();
+    Builder.SetFastMathFlags(Unsafe);
+    unsigned i = 0;
+
+    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+      ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
+      V.buildTree(ValsToReduce, &ReductionOps);
+
+      // Estimate cost.
+      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+      if (Cost >= -SLPCostThreshold)
+        break;
+
+      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
+                   << ". (HorRdx)\n");
+
+      // Vectorize a tree.
+      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+      Value *VectorizedRoot = V.vectorizeTree();
+
+      // Emit a reduction.
+      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+      if (VectorizedTree) {
+        Builder.SetCurrentDebugLocation(Loc);
+        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+                                     ReducedSubTree, "bin.rdx");
+      } else
+        VectorizedTree = ReducedSubTree;
+    }
+
+    if (VectorizedTree) {
+      // Finish the reduction.
+      for (; i < NumReducedVals; ++i) {
+        Builder.SetCurrentDebugLocation(
+            cast<Instruction>(ReducedVals[i])->getDebugLoc());
+        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+                                     ReducedVals[i]);
+      }
+      // Update users.
+      if (ReductionPHI) {
+        assert(ReductionRoot != NULL && "Need a reduction operation");
+        ReductionRoot->setOperand(0, VectorizedTree);
+        ReductionRoot->setOperand(1, ReductionPHI);
+      } else
+        ReductionRoot->replaceAllUsesWith(VectorizedTree);
+    }
+    return VectorizedTree != 0;
+  }
+
+private:
+
+  /// \brief Calculate the cost of a reduction.
+  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+    Type *ScalarTy = FirstReducedVal->getType();
+    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+
+    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
+    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+
+    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
+    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+
+    int ScalarReduxCost =
+        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+
+    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+                 << " for reduction that starts with " << *FirstReducedVal
+                 << " (It is a "
+                 << (IsPairwiseReduction ? "pairwise" : "splitting")
+                 << " reduction)\n");
+
+    return VecReduxCost - ScalarReduxCost;
+  }
+
+  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
+                            Value *R, const Twine &Name = "") {
+    if (Opcode == Instruction::FAdd)
+      return Builder.CreateFAdd(L, R, Name);
+    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
+  }
+
+  /// \brief Emit a horizontal reduction of the vectorized value.
+  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+    assert(VectorizedValue && "Need to have a vectorized tree node");
+    Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
+    assert(isPowerOf2_32(ReduxWidth) &&
+           "We only handle power-of-two reductions for now");
+
+    SmallVector<Constant *, 32> ShuffleMask(ReduxWidth, 0);
+    Value *TmpVec = ValToReduce;
+    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+      if (IsPairwiseReduction) {
+        Value *LeftMask =
+            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+        Value *RightMask =
+            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+        Value *LeftShuf = Builder.CreateShuffleVector(
+            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+        Value *RightShuf = Builder.CreateShuffleVector(
+            TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
+            "rdx.shuf.r");
+        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
+                             "bin.rdx");
+      } else {
+        Value *UpperHalf =
+            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
+        Value *Shuf = Builder.CreateShuffleVector(
+            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
+        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+      }
+    }
+
+    // The result is in the first element of the vector.
+    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+  }
+};
+
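As a reading aid (again not part of the patch), with ReduxWidth = 4 and the
splitting strategy chosen by the cost model, emitReduction lowers a vectorized
tree root %vec into roughly the following sequence (value names are
illustrative):

    %rdx.shuf   = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %bin.rdx    = fadd fast <4 x float> %vec, %rdx.shuf
    %rdx.shuf.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx.1  = fadd fast <4 x float> %bin.rdx, %rdx.shuf.1
    %res        = extractelement <4 x float> %bin.rdx.1, i32 0

The pairwise form instead extracts the even and the odd lanes (<0,2,...> and
<1,3,...>) at each step and adds those two shuffles together.
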
 /// \brief Recognize construction of vectors like
 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
@@ -1981,7 +2308,18 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       if (!BI)
         continue;
 
-      Value *Inst = BI->getOperand(0);
+      // Try to match and vectorize a horizontal reduction.
+      HorizontalReduction HorRdx;
+      if (ShouldVectorizeHor &&
+          HorRdx.matchAssociativeReduction(P, BI, DL) &&
+          HorRdx.tryToReduce(R, TTI)) {
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
+        continue;
+      }
+
+      Value *Inst = BI->getOperand(0);
 
       if (Inst == P)
         Inst = BI->getOperand(1);
@@ -1991,10 +2329,28 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
         Changed = true;
         it = BB->begin();
         e = BB->end();
+        continue;
       }
+      continue;
     }
 
+    // Try to vectorize horizontal reductions feeding into a store.
+    if (StoreInst *SI = dyn_cast<StoreInst>(it))
+      if (BinaryOperator *BinOp =
+              dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+        HorizontalReduction HorRdx;
+        if (ShouldVectorizeHor &&
+            ((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
+              HorRdx.tryToReduce(R, TTI)) ||
+             tryToVectorize(BinOp, R))) {
+          Changed = true;
+          it = BB->begin();
+          e = BB->end();
+          continue;
+        }
+      }
+
     // Try to vectorize trees that start at compare instructions.
     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll
new file mode 100644
index 00000000000..9517066ed2e
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -0,0 +1,415 @@
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; #include <stdint.h>
+;
+; int foo(float *A, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += 7*A[i*4  ] +
+;            7*A[i*4+1] +
+;            7*A[i*4+2] +
+;            7*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: add_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @add_red(float* %A, i32 %n) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
+  %mul = shl nsw i64 %i.033, 2
+  %arrayidx = getelementptr inbounds float* %A, i64 %mul
+  %1 = load float* %arrayidx, align 4
+  %mul2 = fmul float %1, 7.000000e+00
+  %add28 = or i64 %mul, 1
+  %arrayidx4 = getelementptr inbounds float* %A, i64 %add28
+  %2 = load float* %arrayidx4, align 4
+  %mul5 = fmul float %2, 7.000000e+00
+  %add6 = fadd fast float %mul2, %mul5
+  %add829 = or i64 %mul, 2
+  %arrayidx9 = getelementptr inbounds float* %A, i64 %add829
+  %3 = load float* %arrayidx9, align 4
+  %mul10 = fmul float %3, 7.000000e+00
+  %add11 = fadd fast float %add6, %mul10
+  %add1330 = or i64 %mul, 3
+  %arrayidx14 = getelementptr inbounds float* %A, i64 %add1330
+  %4 = load float* %arrayidx14, align 4
+  %mul15 = fmul float %4, 7.000000e+00
+  %add16 = fadd fast float %add11, %mul15
+  %add17 = fadd fast float %sum.032, %add16
+  %inc = add nsw i64 %i.033, 1
+  %exitcond =
icmp eq i64 %inc, %0 + br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + %phitmp = fptosi float %add17 to i32 + br label %for.end + +for.end: + %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} + +; int foo(float * restrict A, float * restrict B, int n) { +; float sum = 0; +; for (intptr_t i=0; i < n; ++i) { +; sum *= B[0]*A[i*4 ] + +; B[1]*A[i*4+1] + +; B[2]*A[i*4+2] + +; B[3]*A[i*4+3]; +; } +; return sum; +; } + +; CHECK-LABEL: mul_red +; CHECK: fmul <4 x float> +; CHECK: shufflevector <4 x float> + +define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) { +entry: + %cmp38 = icmp sgt i32 %n, 0 + br i1 %cmp38, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %0 = load float* %B, align 4 + %arrayidx4 = getelementptr inbounds float* %B, i64 1 + %1 = load float* %arrayidx4, align 4 + %arrayidx9 = getelementptr inbounds float* %B, i64 2 + %2 = load float* %arrayidx9, align 4 + %arrayidx15 = getelementptr inbounds float* %B, i64 3 + %3 = load float* %arrayidx15, align 4 + %4 = sext i32 %n to i64 + br label %for.body + +for.body: + %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ] + %mul = shl nsw i64 %i.040, 2 + %arrayidx2 = getelementptr inbounds float* %A, i64 %mul + %5 = load float* %arrayidx2, align 4 + %mul3 = fmul float %0, %5 + %add35 = or i64 %mul, 1 + %arrayidx6 = getelementptr inbounds float* %A, i64 %add35 + %6 = load float* %arrayidx6, align 4 + %mul7 = fmul float %1, %6 + %add8 = fadd fast float %mul3, %mul7 + %add1136 = or i64 %mul, 2 + %arrayidx12 = getelementptr inbounds float* %A, i64 %add1136 + %7 = load float* %arrayidx12, align 4 + %mul13 = fmul float %2, %7 + %add14 = fadd fast float %add8, %mul13 + %add1737 = or i64 %mul, 3 + %arrayidx18 = getelementptr inbounds float* %A, i64 %add1737 + %8 = load float* %arrayidx18, align 4 + %mul19 = fmul float %3, %8 + %add20 = fadd fast float %add14, %mul19 + %mul21 = fmul float %sum.039, %add20 + %inc = add nsw i64 %i.040, 1 + %exitcond = icmp eq i64 %inc, %4 + br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + %phitmp = fptosi float %mul21 to i32 + br label %for.end + +for.end: + %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} + +; int foo(float * restrict A, float * restrict B, int n) { +; float sum = 0; +; for (intptr_t i=0; i < n; ++i) { +; sum += B[0]*A[i*6 ] + +; B[1]*A[i*6+1] + +; B[2]*A[i*6+2] + +; B[3]*A[i*6+3] + +; B[4]*A[i*6+4] + +; B[5]*A[i*6+5] + +; B[6]*A[i*6+6] + +; B[7]*A[i*6+7] + +; B[8]*A[i*6+8]; +; } +; return sum; +; } + +; CHECK-LABEL: long_red +; CHECK: fmul <4 x float> +; CHECK: shufflevector <4 x float> + +define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) { +entry: + %cmp81 = icmp sgt i32 %n, 0 + br i1 %cmp81, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %0 = load float* %B, align 4 + %arrayidx4 = getelementptr inbounds float* %B, i64 1 + %1 = load float* %arrayidx4, align 4 + %arrayidx9 = getelementptr inbounds float* %B, i64 2 + %2 = load float* %arrayidx9, align 4 + %arrayidx15 = getelementptr inbounds float* %B, i64 3 + %3 = load float* %arrayidx15, align 4 + %arrayidx21 = getelementptr inbounds float* %B, i64 4 + %4 = load float* %arrayidx21, align 4 + %arrayidx27 = getelementptr inbounds float* %B, i64 5 + %5 = load float* %arrayidx27, align 4 + 
%arrayidx33 = getelementptr inbounds float* %B, i64 6 + %6 = load float* %arrayidx33, align 4 + %arrayidx39 = getelementptr inbounds float* %B, i64 7 + %7 = load float* %arrayidx39, align 4 + %arrayidx45 = getelementptr inbounds float* %B, i64 8 + %8 = load float* %arrayidx45, align 4 + %9 = sext i32 %n to i64 + br label %for.body + +for.body: + %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ] + %mul = mul nsw i64 %i.083, 6 + %arrayidx2 = getelementptr inbounds float* %A, i64 %mul + %10 = load float* %arrayidx2, align 4 + %mul3 = fmul fast float %0, %10 + %add80 = or i64 %mul, 1 + %arrayidx6 = getelementptr inbounds float* %A, i64 %add80 + %11 = load float* %arrayidx6, align 4 + %mul7 = fmul fast float %1, %11 + %add8 = fadd fast float %mul3, %mul7 + %add11 = add nsw i64 %mul, 2 + %arrayidx12 = getelementptr inbounds float* %A, i64 %add11 + %12 = load float* %arrayidx12, align 4 + %mul13 = fmul fast float %2, %12 + %add14 = fadd fast float %add8, %mul13 + %add17 = add nsw i64 %mul, 3 + %arrayidx18 = getelementptr inbounds float* %A, i64 %add17 + %13 = load float* %arrayidx18, align 4 + %mul19 = fmul fast float %3, %13 + %add20 = fadd fast float %add14, %mul19 + %add23 = add nsw i64 %mul, 4 + %arrayidx24 = getelementptr inbounds float* %A, i64 %add23 + %14 = load float* %arrayidx24, align 4 + %mul25 = fmul fast float %4, %14 + %add26 = fadd fast float %add20, %mul25 + %add29 = add nsw i64 %mul, 5 + %arrayidx30 = getelementptr inbounds float* %A, i64 %add29 + %15 = load float* %arrayidx30, align 4 + %mul31 = fmul fast float %5, %15 + %add32 = fadd fast float %add26, %mul31 + %add35 = add nsw i64 %mul, 6 + %arrayidx36 = getelementptr inbounds float* %A, i64 %add35 + %16 = load float* %arrayidx36, align 4 + %mul37 = fmul fast float %6, %16 + %add38 = fadd fast float %add32, %mul37 + %add41 = add nsw i64 %mul, 7 + %arrayidx42 = getelementptr inbounds float* %A, i64 %add41 + %17 = load float* %arrayidx42, align 4 + %mul43 = fmul fast float %7, %17 + %add44 = fadd fast float %add38, %mul43 + %add47 = add nsw i64 %mul, 8 + %arrayidx48 = getelementptr inbounds float* %A, i64 %add47 + %18 = load float* %arrayidx48, align 4 + %mul49 = fmul fast float %8, %18 + %add50 = fadd fast float %add44, %mul49 + %add51 = fadd fast float %sum.082, %add50 + %inc = add nsw i64 %i.083, 1 + %exitcond = icmp eq i64 %inc, %9 + br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + %phitmp = fptosi float %add51 to i32 + br label %for.end + +for.end: + %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} + +; int foo(float * restrict A, float * restrict B, int n) { +; float sum = 0; +; for (intptr_t i=0; i < n; ++i) { +; sum += B[0]*A[i*4 ]; +; sum += B[1]*A[i*4+1]; +; sum += B[2]*A[i*4+2]; +; sum += B[3]*A[i*4+3]; +; } +; return sum; +; } + +; CHECK-LABEL: chain_red +; CHECK: fmul <4 x float> +; CHECK: shufflevector <4 x float> + +define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) { +entry: + %cmp41 = icmp sgt i32 %n, 0 + br i1 %cmp41, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %0 = load float* %B, align 4 + %arrayidx4 = getelementptr inbounds float* %B, i64 1 + %1 = load float* %arrayidx4, align 4 + %arrayidx10 = getelementptr inbounds float* %B, i64 2 + %2 = load float* %arrayidx10, align 4 + %arrayidx16 = getelementptr inbounds float* %B, i64 3 + %3 = load float* %arrayidx16, align 4 + %4 = sext 
i32 %n to i64 + br label %for.body + +for.body: + %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ] + %mul = shl nsw i64 %i.043, 2 + %arrayidx2 = getelementptr inbounds float* %A, i64 %mul + %5 = load float* %arrayidx2, align 4 + %mul3 = fmul fast float %0, %5 + %add = fadd fast float %sum.042, %mul3 + %add638 = or i64 %mul, 1 + %arrayidx7 = getelementptr inbounds float* %A, i64 %add638 + %6 = load float* %arrayidx7, align 4 + %mul8 = fmul fast float %1, %6 + %add9 = fadd fast float %add, %mul8 + %add1239 = or i64 %mul, 2 + %arrayidx13 = getelementptr inbounds float* %A, i64 %add1239 + %7 = load float* %arrayidx13, align 4 + %mul14 = fmul fast float %2, %7 + %add15 = fadd fast float %add9, %mul14 + %add1840 = or i64 %mul, 3 + %arrayidx19 = getelementptr inbounds float* %A, i64 %add1840 + %8 = load float* %arrayidx19, align 4 + %mul20 = fmul fast float %3, %8 + %add21 = fadd fast float %add15, %mul20 + %inc = add nsw i64 %i.043, 1 + %exitcond = icmp eq i64 %inc, %4 + br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + %phitmp = fptosi float %add21 to i32 + br label %for.end + +for.end: + %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} + +; int foo(float * restrict A, float * restrict B, float * restrict C, int n) { +; float sum = 0; +; for (intptr_t i=0; i < n; ++i) { +; C[i] = B[0] *A[i*4 ] + +; B[1] *A[i*4+1] + +; B[2] *A[i*4+2] + +; B[3] *A[i*4+3]; +; } +; return sum; +; } + +; CHECK-LABEL: store_red +; CHECK: fmul <4 x float> +; CHECK: shufflevector <4 x float> + +define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) { +entry: + %cmp37 = icmp sgt i32 %n, 0 + br i1 %cmp37, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %arrayidx4 = getelementptr inbounds float* %B, i64 1 + %arrayidx9 = getelementptr inbounds float* %B, i64 2 + %arrayidx15 = getelementptr inbounds float* %B, i64 3 + %0 = sext i32 %n to i64 + br label %for.body + +for.body: + %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ] + %1 = load float* %B, align 4 + %mul = shl nsw i64 %i.039, 2 + %arrayidx2 = getelementptr inbounds float* %A, i64 %mul + %2 = load float* %arrayidx2, align 4 + %mul3 = fmul fast float %1, %2 + %3 = load float* %arrayidx4, align 4 + %add34 = or i64 %mul, 1 + %arrayidx6 = getelementptr inbounds float* %A, i64 %add34 + %4 = load float* %arrayidx6, align 4 + %mul7 = fmul fast float %3, %4 + %add8 = fadd fast float %mul3, %mul7 + %5 = load float* %arrayidx9, align 4 + %add1135 = or i64 %mul, 2 + %arrayidx12 = getelementptr inbounds float* %A, i64 %add1135 + %6 = load float* %arrayidx12, align 4 + %mul13 = fmul fast float %5, %6 + %add14 = fadd fast float %add8, %mul13 + %7 = load float* %arrayidx15, align 4 + %add1736 = or i64 %mul, 3 + %arrayidx18 = getelementptr inbounds float* %A, i64 %add1736 + %8 = load float* %arrayidx18, align 4 + %mul19 = fmul fast float %7, %8 + %add20 = fadd fast float %add14, %mul19 + store float %add20, float* %C.addr.038, align 4 + %incdec.ptr = getelementptr inbounds float* %C.addr.038, i64 1 + %inc = add nsw i64 %i.039, 1 + %exitcond = icmp eq i64 %inc, %0 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret i32 0 +} + + +; void foo(double * restrict A, double * restrict B, double * restrict C, +; int n) { +; for (intptr_t i=0; i < 
n; ++i) { +; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1]; +; } +; } + +; CHECK-LABEL: store_red_double +; CHECK: fmul <2 x double> +; CHECK: extractelement <2 x double> +; CHECK: extractelement <2 x double> + +define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) { +entry: + %cmp17 = icmp sgt i32 %n, 0 + br i1 %cmp17, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %0 = load double* %B, align 8 + %arrayidx4 = getelementptr inbounds double* %B, i64 1 + %1 = load double* %arrayidx4, align 8 + %2 = sext i32 %n to i64 + br label %for.body + +for.body: + %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %mul = shl nsw i64 %i.018, 2 + %arrayidx2 = getelementptr inbounds double* %A, i64 %mul + %3 = load double* %arrayidx2, align 8 + %mul3 = fmul fast double %0, %3 + %add16 = or i64 %mul, 1 + %arrayidx6 = getelementptr inbounds double* %A, i64 %add16 + %4 = load double* %arrayidx6, align 8 + %mul7 = fmul fast double %1, %4 + %add8 = fadd fast double %mul3, %mul7 + %arrayidx9 = getelementptr inbounds double* %C, i64 %i.018 + store double %add8, double* %arrayidx9, align 8 + %inc = add nsw i64 %i.018, 1 + %exitcond = icmp eq i64 %inc, %2 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} -- 2.34.1
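For orientation, the first CHECK block above (add_red) is looking for a loop
body of roughly this shape once the reduction fires (a hand-written sketch, not
actual -S output; value names are illustrative):

    %ptr   = bitcast float* %arrayidx to <4 x float>*
    %vload = load <4 x float>* %ptr, align 4
    %vmul  = fmul <4 x float> %vload, <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>

followed by the shufflevector/fadd/extractelement ladder sketched after
emitReduction above and a final scalar fadd into the %sum accumulator. The
CHECK lines only pin down the fmul <4 x float> and one shufflevector
<4 x float>, so they remain valid whichever reduction strategy the cost model
picks.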