Add support for floating-point minnum and maxnum

[oota-llvm.git] / lib / CodeGen / SelectionDAG / DAGCombiner.cpp
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index e05714342d211301486150475234ec9cd29d4eb8..2c87d120cb7a50703a1d3894cf6eefeebc9a24c3 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -338,6 +338,7 @@ namespace {
                                           unsigned HiOp);
      SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
      SDValue CombineExtLoad(SDNode *N);
+    SDValue combineRepeatedFPDivisors(SDNode *N);
      SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
      SDValue BuildSDIV(SDNode *N);
      SDValue BuildSDIVPow2(SDNode *N);
@@ -409,7 +410,7 @@ namespace {
      void getStoreMergeAndAliasCandidates(
          StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
          SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes);
-    
+
      /// Merge consecutive store operations into a wide store.
      /// This optimization uses wide integers or vectors when possible.
      /// \return True if some memory operations were changed.
@@ -427,9 +428,7 @@ namespace {
      DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
          : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
            OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
-      auto *F = DAG.getMachineFunction().getFunction();
-      ForCodeSize = F->hasFnAttribute(Attribute::OptimizeForSize) ||
-                    F->hasFnAttribute(Attribute::MinSize);
+      ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
      }
  
      /// Runs the dag combiner on all nodes in the work list
@@ -1216,9 +1215,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
    LegalTypes = Level >= AfterLegalizeTypes;
  
    // Add all the dag nodes to the worklist.
-  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
-       E = DAG.allnodes_end(); I != E; ++I)
-    AddToWorklist(I);
+  for (SDNode &Node : DAG.allnodes())
+    AddToWorklist(&Node);
  
    // Create a dummy node (which is not added to allnodes), that adds a reference
    // to the root node, preventing it from being deleted, and tracking any
@@ -2192,8 +2190,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
        return SDValue();
  
      // Target-specific implementation of sdiv x, pow2.
-    SDValue Res = BuildSDIVPow2(N);
-    if (Res.getNode())
+    if (SDValue Res = BuildSDIVPow2(N))
        return Res;
  
      unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();
@@ -2229,10 +2226,9 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
  
    // If integer divide is expensive and we satisfy the requirements, emit an
    // alternate sequence.
-  if (N1C && !TLI.isIntDivCheap()) {
-    SDValue Op = BuildSDIV(N);
-    if (Op.getNode()) return Op;
-  }
+  if (N1C && !TLI.isIntDivCheap())
+    if (SDValue Op = BuildSDIV(N))
+      return Op;
  
    // undef / X -> 0
    if (N0.getOpcode() == ISD::UNDEF)
@@ -2285,10 +2281,9 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
      }
    }
    // fold (udiv x, c) -> alternate
-  if (N1C && !TLI.isIntDivCheap()) {
-    SDValue Op = BuildUDIV(N);
-    if (Op.getNode()) return Op;
-  }
+  if (N1C && !TLI.isIntDivCheap())
+    if (SDValue Op = BuildUDIV(N))
+      return Op;
  
    // undef / X -> 0
    if (N0.getOpcode() == ISD::UNDEF)
@@ -2532,8 +2527,8 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
  }
  
  SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
-  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS);
-  if (Res.getNode()) return Res;
+  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
+    return Res;
  
    EVT VT = N->getValueType(0);
    SDLoc DL(N);
@@ -2563,8 +2558,8 @@ SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
  }
  
  SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
-  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU);
-  if (Res.getNode()) return Res;
+  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
+    return Res;
  
    EVT VT = N->getValueType(0);
    SDLoc DL(N);
@@ -2614,15 +2609,15 @@ SDValue DAGCombiner::visitUMULO(SDNode *N) {
  }
  
  SDValue DAGCombiner::visitSDIVREM(SDNode *N) {
-  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM);
-  if (Res.getNode()) return Res;
+  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM))
+    return Res;
  
    return SDValue();
  }
  
  SDValue DAGCombiner::visitUDIVREM(SDNode *N) {
-  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::UDIV, ISD::UREM);
-  if (Res.getNode()) return Res;
+  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::UDIV, ISD::UREM))
+    return Res;
  
    return SDValue();
  }
@@ -3142,10 +3137,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
      return Combined;
  
    // Simplify: (and (op x...), (op y...))  -> (op (and x, y))
-  if (N0.getOpcode() == N1.getOpcode()) {
-    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
-    if (Tmp.getNode()) return Tmp;
-  }
+  if (N0.getOpcode() == N1.getOpcode())
+    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
+      return Tmp;
  
    // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
    // fold (and (sra)) -> (and (srl)) when possible.
@@ -3665,11 +3659,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
      return Combined;
  
    // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
-  SDValue BSwap = MatchBSwapHWord(N, N0, N1);
-  if (BSwap.getNode())
+  if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
      return BSwap;
-  BSwap = MatchBSwapHWordLow(N, N0, N1);
-  if (BSwap.getNode())
+  if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
      return BSwap;
  
    // reassociate or
@@ -3690,10 +3682,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
      }
    }
    // Simplify: (or (op x...), (op y...))  -> (op (or x, y))
-  if (N0.getOpcode() == N1.getOpcode()) {
-    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
-    if (Tmp.getNode()) return Tmp;
-  }
+  if (N0.getOpcode() == N1.getOpcode())
+    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
+      return Tmp;
  
    // See if this is some rotate idiom.
    if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
@@ -4112,10 +4103,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
    }
  
    // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
-  if (N0.getOpcode() == N1.getOpcode()) {
-    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
-    if (Tmp.getNode()) return Tmp;
-  }
+  if (N0.getOpcode() == N1.getOpcode())
+    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
+      return Tmp;
  
    // Simplify the expression using non-local knowledge.
    if (!VT.isVector() &&
@@ -4434,11 +4424,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);
    }
  
-  if (N1C && !N1C->isOpaque()) {
-    SDValue NewSHL = visitShiftByConstant(N, N1C);
-    if (NewSHL.getNode())
+  if (N1C && !N1C->isOpaque())
+    if (SDValue NewSHL = visitShiftByConstant(N, N1C))
        return NewSHL;
-  }
  
    return SDValue();
  }
@@ -4583,11 +4571,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
    if (DAG.SignBitIsZero(N0))
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
  
-  if (N1C && !N1C->isOpaque()) {
-    SDValue NewSRA = visitShiftByConstant(N, N1C);
-    if (NewSRA.getNode())
+  if (N1C && !N1C->isOpaque())
+    if (SDValue NewSRA = visitShiftByConstant(N, N1C))
        return NewSRA;
-  }
  
    return SDValue();
  }
@@ -4744,8 +4730,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
    // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
    if (N1.getOpcode() == ISD::TRUNCATE &&
        N1.getOperand(0).getOpcode() == ISD::AND) {
-    SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode());
-    if (NewOp1.getNode())
+    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
        return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
    }
  
@@ -4754,15 +4739,12 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
    if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
      return SDValue(N, 0);
  
-  if (N1C && !N1C->isOpaque()) {
-    SDValue NewSRL = visitShiftByConstant(N, N1C);
-    if (NewSRL.getNode())
+  if (N1C && !N1C->isOpaque())
+    if (SDValue NewSRL = visitShiftByConstant(N, N1C))
        return NewSRL;
-  }
  
    // Attempt to convert a srl of a load into a narrower zero-extending load.
-  SDValue NarrowLoad = ReduceLoadWidth(N);
-  if (NarrowLoad.getNode())
+  if (SDValue NarrowLoad = ReduceLoadWidth(N))
      return NarrowLoad;
  
    // Here is a common situation. We want to optimize:
@@ -5523,8 +5505,7 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
    if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
        N2.getOpcode() == ISD::CONCAT_VECTORS &&
        ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
-    SDValue CV = ConvertSelectToConcatVector(N, DAG);
-    if (CV.getNode())
+    if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
        return CV;
    }
  
@@ -5580,7 +5561,7 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) {
                         SDLoc(N));
  }
  
-/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or 
+/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
  /// a build_vector of constants.
  /// This function is called by the DAGCombiner when visiting sext/zext/aext
  /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
@@ -5837,8 +5818,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
    if (N0.getOpcode() == ISD::TRUNCATE) {
      // fold (sext (truncate (load x))) -> (sext (smaller load x))
      // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
-    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
-    if (NarrowLoad.getNode()) {
+    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
        SDNode* oye = N0.getNode()->getOperand(0).getNode();
        if (NarrowLoad.getNode() != N0.getNode()) {
          CombineTo(N0.getNode(), NarrowLoad);
@@ -6120,8 +6100,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
    // fold (zext (truncate (load x))) -> (zext (smaller load x))
    // fold (zext (truncate (srl (load x), c))) -> (zext (small load (x+c/n)))
    if (N0.getOpcode() == ISD::TRUNCATE) {
-    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
-    if (NarrowLoad.getNode()) {
+    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
        SDNode* oye = N0.getNode()->getOperand(0).getNode();
        if (NarrowLoad.getNode() != N0.getNode()) {
          CombineTo(N0.getNode(), NarrowLoad);
@@ -6138,8 +6117,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
  
      // fold (zext (truncate (load x))) -> (zext (smaller load x))
      // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
-    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
-    if (NarrowLoad.getNode()) {
+    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
        SDNode* oye = N0.getNode()->getOperand(0).getNode();
        if (NarrowLoad.getNode() != N0.getNode()) {
          CombineTo(N0.getNode(), NarrowLoad);
@@ -6378,8 +6356,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
    // fold (aext (truncate (load x))) -> (aext (smaller load x))
    // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
    if (N0.getOpcode() == ISD::TRUNCATE) {
-    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
-    if (NarrowLoad.getNode()) {
+    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
        SDNode* oye = N0.getNode()->getOperand(0).getNode();
        if (NarrowLoad.getNode() != N0.getNode()) {
          CombineTo(N0.getNode(), NarrowLoad);
@@ -6546,8 +6523,7 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
        // Watch out for shift count overflow though.
        if (Amt >= Mask.getBitWidth()) break;
        APInt NewMask = Mask << Amt;
-      SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask);
-      if (SimplifyLHS.getNode())
+      if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask))
          return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),
                             SimplifyLHS, V.getOperand(1));
      }
@@ -6771,8 +6747,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
  
    // fold (sext_in_reg (load x)) -> (smaller sextload x)
    // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
-  SDValue NarrowLoad = ReduceLoadWidth(N);
-  if (NarrowLoad.getNode())
+  if (SDValue NarrowLoad = ReduceLoadWidth(N))
      return NarrowLoad;
  
    // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
@@ -6999,9 +6974,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
    // fold (truncate (load x)) -> (smaller load x)
    // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
    if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
-    SDValue Reduced = ReduceLoadWidth(N);
-    if (Reduced.getNode())
+    if (SDValue Reduced = ReduceLoadWidth(N))
        return Reduced;
+
      // Handle the case where the load remains an extending load even
      // after truncation.
      if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
@@ -7240,11 +7215,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
    }
  
    // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
-  if (N0.getOpcode() == ISD::BUILD_PAIR) {
-    SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT);
-    if (CombineLD.getNode())
+  if (N0.getOpcode() == ISD::BUILD_PAIR)
+    if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
        return CombineLD;
-  }
  
    // Remove double bitcasts from shuffles - this is often a legacy of
    // XformToShuffleWithZero being used to combine bitmaskings (of
@@ -7257,10 +7230,10 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
      ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);
  
      // If operands are a bitcast, peek through if it casts the original VT.
-    // If operands are a UNDEF or constant, just bitcast back to original VT.
+    // If operands are a constant, just bitcast back to original VT.
      auto PeekThroughBitcast = [&](SDValue Op) {
        if (Op.getOpcode() == ISD::BITCAST &&
-          Op.getOperand(0)->getValueType(0) == VT)
+          Op.getOperand(0).getValueType() == VT)
          return SDValue(Op.getOperand(0));
        if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
            ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
@@ -8003,8 +7976,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
    } // enable-unsafe-fp-math
  
    // FADD -> FMA combines:
-  SDValue Fused = visitFADDForFMACombine(N);
-  if (Fused) {
+  if (SDValue Fused = visitFADDForFMACombine(N)) {
      AddToWorklist(Fused.getNode());
      return Fused;
    }
@@ -8068,8 +8040,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
    }
  
    // FSUB -> FMA combines:
-  SDValue Fused = visitFSUBForFMACombine(N);
-  if (Fused) {
+  if (SDValue Fused = visitFSUBForFMACombine(N)) {
      AddToWorklist(Fused.getNode());
      return Fused;
    }
@@ -8139,7 +8110,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
      // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs
      // during an early run of DAGCombiner can prevent folding with fmuls
      // inserted during lowering.
-    if (N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1)) {
+    if (N0.getOpcode() == ISD::FADD &&
+        (N0.getOperand(0) == N0.getOperand(1)) &&
+        N0.hasOneUse()) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
        SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1);
        return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts);
@@ -8253,6 +8226,66 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
    return SDValue();
  }
  
+// Combine multiple FDIVs with the same divisor into multiple FMULs by the
+// reciprocal, e.g. (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip).
+// Returns SDValue(N, 0) if the FDIV users of N's divisor were rewritten, or an
+// empty SDValue if nothing was changed. This is not always beneficial: targets
+// differ in the relative costs of FDIV and FMUL, so two FDIVs may be cheaper
+// than one FDIV plus two FMULs, and the critical path grows from "one FDIV" to
+// "one FDIV + one FMUL".
+SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  // Skip if the current node is itself a reciprocal (1.0 / D).
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  if (N0CFP && N0CFP->isExactlyValue(1.0))
+    return SDValue();
+
+  // Exit early if the target does not want this transform or if there can't
+  // possibly be enough uses of the divisor to make the transform worthwhile.
+  SDValue N1 = N->getOperand(1);
+  unsigned MinUses = TLI.combineRepeatedFPDivisors();
+  if (!MinUses || N1->use_size() < MinUses)
+    return SDValue();
+
+  // Find all FDIV users of the same divisor.
+  // Use a set because duplicates may be present in the user list.
+  SetVector<SDNode *> Users;
+  for (auto *U : N1->uses())
+    if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1)
+      Users.insert(U);
+
+  // Now that we have the actual number of divisor uses, make sure it meets
+  // the minimum threshold specified by the target.
+  if (Users.size() < MinUses)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
+  // FIXME: This optimization requires some level of fast-math, so the
+  // created reciprocal node should at least have the 'allowReciprocal'
+  // fast-math-flag set.
+  SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1);
+
+  // Dividend / Divisor -> Dividend * Reciprocal
+  for (auto *U : Users) {
+    SDValue Dividend = U->getOperand(0);
+    if (Dividend != FPOne) {
+      SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
+                                    Reciprocal);
+      CombineTo(U, NewNode);
+    } else if (U != Reciprocal.getNode()) {
+      // In the absence of fast-math-flags, this user node is always the
+      // same node as Reciprocal, but with FMF they may be different nodes.
+      CombineTo(U, Reciprocal);
+    }
+  }
+  return SDValue(N, 0);  // N was replaced.
+}
+
  SDValue DAGCombiner::visitFDIV(SDNode *N) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
@@ -8353,48 +8386,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
      }
    }
  
-  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
-  // reciprocal.
-  // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
-  // Notice that this is not always beneficial. One reason is different target
-  // may have different costs for FDIV and FMUL, so sometimes the cost of two
-  // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
-  // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
-  if (Options.UnsafeFPMath) {
-    // Skip if current node is a reciprocal.
-    if (N0CFP && N0CFP->isExactlyValue(1.0))
-      return SDValue();
-
-    SmallVector<SDNode *, 4> Users;
-    // Find all FDIV users of the same divisor.
-    for (auto *U : N1->uses()) {
-      if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1)
-        Users.push_back(U);
-    }
-
-    if (TLI.combineRepeatedFPDivisors(Users.size())) {
-      SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
-      // FIXME: This optimization requires some level of fast-math, so the
-      // created reciprocal node should at least have the 'allowReciprocal'
-      // fast-math-flag set.
-      SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1);
-
-      // Dividend / Divisor -> Dividend * Reciprocal
-      for (auto *U : Users) {
-        SDValue Dividend = U->getOperand(0);
-        if (Dividend != FPOne) {
-          SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
-                                        Reciprocal);
-          CombineTo(U, NewNode);
-        } else if (U != Reciprocal.getNode()) {
-          // In the absence of fast-math-flags, this user node is always the
-          // same node as Reciprocal, but with FMF they may be different nodes.
-          CombineTo(U, Reciprocal);
-        }
-      }
-      return SDValue(N, 0);  // N was replaced.
-    }
-  }
+  if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
+    return CombineRepeatedDivisors;
  
    return SDValue();
  }
@@ -8421,7 +8414,7 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
    SDValue RV = BuildRsqrtEstimate(N->getOperand(0));
    if (!RV)
      return SDValue();
-  
+
    EVT VT = RV.getValueType();
    SDLoc DL(N);
    RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV);
@@ -9034,8 +9027,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
      SDValue Op1 = TheXor->getOperand(1);
      if (Op0.getOpcode() == Op1.getOpcode()) {
        // Avoid missing important xor optimizations.
-      SDValue Tmp = visitXOR(TheXor);
-      if (Tmp.getNode()) {
+      if (SDValue Tmp = visitXOR(TheXor)) {
          if (Tmp.getNode() != TheXor) {
            DEBUG(dbgs() << "\nReplacing.8 ";
                  TheXor->dump(&DAG);
@@ -10743,17 +10735,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
    return true;
  }
  
-static bool allowableAlignment(const SelectionDAG &DAG,
-                               const TargetLowering &TLI, EVT EVTTy,
-                               unsigned AS, unsigned Align) {
-  if (TLI.allowsMisalignedMemoryAccesses(EVTTy, AS, Align))
-    return true;
-
-  Type *Ty = EVTTy.getTypeForEVT(*DAG.getContext());
-  unsigned ABIAlignment = DAG.getDataLayout().getPrefTypeAlignment(Ty);
-  return (Align >= ABIAlignment);
-}
-
  void DAGCombiner::getStoreMergeAndAliasCandidates(
      StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
      SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) {
@@ -10868,12 +10849,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
    // We need to make sure that these nodes do not interfere with
    // any of the store nodes.
    SmallVector<LSBaseSDNode*, 8> AliasLoadNodes;
-  
+
    // Save the StoreSDNodes that we find in the chain.
    SmallVector<MemOpLink, 8> StoreNodes;
  
    getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes);
-  
+
    // Check if there is anything to merge.
    if (StoreNodes.size() < 2)
      return false;
@@ -10919,6 +10900,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
    unsigned FirstStoreAS = FirstInChain->getAddressSpace();
    unsigned FirstStoreAlign = FirstInChain->getAlignment();
+  LLVMContext &Context = *DAG.getContext();
+  const DataLayout &DL = DAG.getDataLayout();
  
    // Store the constants into memory as one consecutive store.
    if (IsConstantSrc) {
@@ -10940,27 +10923,28 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
  
        // Find a legal type for the constant store.
        unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
-      EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
+      EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
        if (TLI.isTypeLegal(StoreTy) &&
-          allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS,
-                             FirstStoreAlign)) {
+          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+                                 FirstStoreAlign)) {
          LastLegalType = i+1;
        // Or check whether a truncstore is legal.
-      } else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) ==
+      } else if (TLI.getTypeAction(Context, StoreTy) ==
                   TargetLowering::TypePromoteInteger) {
          EVT LegalizedStoredValueTy =
-          TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
+          TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
          if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
-            allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstStoreAS,
-                               FirstStoreAlign)) {
+            TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+                                   FirstStoreAS, FirstStoreAlign)) {
            LastLegalType = i + 1;
          }
        }
  
        // Find a legal type for the vector store.
-      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+      EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
        if (TLI.isTypeLegal(Ty) &&
-          allowableAlignment(DAG, TLI, Ty, FirstStoreAS, FirstStoreAlign)) {
+          TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+                                 FirstStoreAlign)) {
          LastLegalVectorType = i + 1;
        }
      }
@@ -11004,9 +10988,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
          return false;
  
        // Find a legal type for the vector store.
-      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+      EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
        if (TLI.isTypeLegal(Ty) &&
-          allowableAlignment(DAG, TLI, Ty, FirstStoreAS, FirstStoreAlign))
+          TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+                                 FirstStoreAlign))
          NumElem = i + 1;
      }
  
@@ -11094,33 +11079,37 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
      LastConsecutiveLoad = i;
  
      // Find a legal type for the vector store.
-    EVT StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+    EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1);
      if (TLI.isTypeLegal(StoreTy) &&
-        allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, FirstStoreAlign) &&
-        allowableAlignment(DAG, TLI, StoreTy, FirstLoadAS, FirstLoadAlign)) {
+        TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+                               FirstStoreAlign) &&
+        TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+                               FirstLoadAlign)) {
        LastLegalVectorType = i + 1;
      }
  
      // Find a legal type for the integer store.
      unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
-    StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
+    StoreTy = EVT::getIntegerVT(Context, SizeInBits);
      if (TLI.isTypeLegal(StoreTy) &&
-        allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, FirstStoreAlign) &&
-        allowableAlignment(DAG, TLI, StoreTy, FirstLoadAS, FirstLoadAlign))
+        TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+                               FirstStoreAlign) &&
+        TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+                               FirstLoadAlign))
        LastLegalIntegerType = i + 1;
      // Or check whether a truncstore and extload is legal.
-    else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) ==
+    else if (TLI.getTypeAction(Context, StoreTy) ==
               TargetLowering::TypePromoteInteger) {
        EVT LegalizedStoredValueTy =
-        TLI.getTypeToTransformTo(*DAG.getContext(), StoreTy);
+        TLI.getTypeToTransformTo(Context, StoreTy);
        if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
            TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
            TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
            TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
-          allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstStoreAS,
-                             FirstStoreAlign) &&
-          allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstLoadAS,
-                             FirstLoadAlign))
+          TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+                                 FirstStoreAS, FirstStoreAlign) &&
+          TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+                                 FirstLoadAS, FirstLoadAlign))
          LastLegalIntegerType = i+1;
      }
    }
@@ -11155,10 +11144,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
    // to memory.
    EVT JointMemOpVT;
    if (UseVectorTy) {
-    JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+    JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem);
    } else {
      unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
-    JointMemOpVT = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
+    JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
    }
  
    SDLoc LoadDL(LoadNodes[0].MemNode);
@@ -11319,8 +11308,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
  
    // Try transforming a pair floating point load / store ops to integer
    // load / store ops.
-  SDValue NewST = TransformFPLoadStorePair(N);
-  if (NewST.getNode())
+  if (SDValue NewST = TransformFPLoadStorePair(N))
      return NewST;
  
    bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
@@ -13002,34 +12990,76 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
    if (RHS.getOpcode() == ISD::BITCAST)
      RHS = RHS.getOperand(0);
  
-  if (RHS.getOpcode() == ISD::BUILD_VECTOR) {
+  if (RHS.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  EVT RVT = RHS.getValueType();
+  unsigned NumElts = RHS.getNumOperands();
+
+  // Attempt to create a valid clear mask, splitting the mask into
+  // sub elements and checking to see if each is
+  // all zeros or all ones - suitable for shuffle masking.
+  auto BuildClearMask = [&](int Split) {
+    int NumSubElts = NumElts * Split;
+    int NumSubBits = RVT.getScalarSizeInBits() / Split;
+
      SmallVector<int, 8> Indices;
-    unsigned NumElts = RHS.getNumOperands();
+    for (int i = 0; i != NumSubElts; ++i) {
+      int EltIdx = i / Split;
+      int SubIdx = i % Split;
+      SDValue Elt = RHS.getOperand(EltIdx);
+      if (Elt.getOpcode() == ISD::UNDEF) {
+        Indices.push_back(-1);
+        continue;
+      }
  
-    for (unsigned i = 0; i != NumElts; ++i) {
-      SDValue Elt = RHS.getOperand(i);
-      if (isAllOnesConstant(Elt))
+      APInt Bits;
+      if (isa<ConstantSDNode>(Elt))
+        Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
+      else if (isa<ConstantFPSDNode>(Elt))
+        Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
+      else
+        return SDValue();
+
+      // Extract the sub element from the constant bit mask.
+      if (DAG.getDataLayout().isBigEndian()) {
+        Bits = Bits.lshr((Split - SubIdx - 1) * NumSubBits);
+      } else {
+        Bits = Bits.lshr(SubIdx * NumSubBits);
+      }
+
+      if (Split > 1)
+        Bits = Bits.trunc(NumSubBits);
+
+      if (Bits.isAllOnesValue())
          Indices.push_back(i);
-      else if (isNullConstant(Elt))
-        Indices.push_back(NumElts+i);
+      else if (Bits == 0)
+        Indices.push_back(i + NumSubElts);
        else
          return SDValue();
      }
  
      // Let's see if the target supports this vector_shuffle.
-    EVT RVT = RHS.getValueType();
-    if (!TLI.isVectorClearMaskLegal(Indices, RVT))
+    EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
+    EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
+    if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
        return SDValue();
  
-    // Return the new VECTOR_SHUFFLE node.
-    EVT EltVT = RVT.getVectorElementType();
-    SmallVector<SDValue,8> ZeroOps(RVT.getVectorNumElements(),
-                                   DAG.getConstant(0, dl, EltVT));
-    SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, RVT, ZeroOps);
-    LHS = DAG.getNode(ISD::BITCAST, dl, RVT, LHS);
-    SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]);
-    return DAG.getNode(ISD::BITCAST, dl, VT, Shuf);
-  }
+    SDValue Zero = DAG.getConstant(0, dl, ClearVT);
+    return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, dl,
+                                                   DAG.getBitcast(ClearVT, LHS),
+                                                   Zero, &Indices[0]));
+  };
+
+  // Determine maximum split level (byte level masking).
+  int MaxSplit = 1;
+  if (RVT.getScalarSizeInBits() % 8 == 0)
+    MaxSplit = RVT.getScalarSizeInBits() / 8;
+
+  for (int Split = 1; Split <= MaxSplit; ++Split)
+    if (RVT.getScalarSizeInBits() % Split == 0)
+      if (SDValue S = BuildClearMask(Split))
+        return S;
  
    return SDValue();
  }
@@ -13042,9 +13072,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
  
-  if (SDValue Shuffle = XformToShuffleWithZero(N))
-    return Shuffle;
-
    // If the LHS and RHS are BUILD_VECTOR nodes, see if we can constant fold
    // this operation.
    if (LHS.getOpcode() == ISD::BUILD_VECTOR &&
@@ -13095,6 +13122,10 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
        return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops);
    }
  
+  // Try to convert a constant mask AND into a shuffle clear mask.
+  if (SDValue Shuffle = XformToShuffleWithZero(N))
+    return Shuffle;
+
    // Type legalization might introduce new shuffles in the DAG.
    // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
    //   -> (shuffle (VBinOp (A, B)), Undef, Mask).
@@ -13844,6 +13875,15 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
    // If they are both volatile then they cannot be reordered.
    if (Op0->isVolatile() && Op1->isVolatile()) return true;
  
+  // If one operation reads from invariant memory, and the other may store, they
+  // cannot alias. These should really be checking the equivalent of mayWrite,
+  // but it only matters for memory nodes other than load/store.
+  if (Op0->isInvariant() && Op1->writeMem())
+    return false;
+
+  if (Op1->isInvariant() && Op0->writeMem())
+    return false;
+
    // Gather base node and offset information.
    SDValue Base1, Base2;
    int64_t Offset1, Offset2;