Do not use dyn_cast<> after isa<>

[oota-llvm.git] / lib / CodeGen / SelectionDAG / DAGCombiner.cpp
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index 52d620b1d540fc5958cf7cc8d63c645c6bea8203..ab3a6631d3393ffb569189e53d443c792f23e6ef 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -245,6 +245,7 @@ namespace {
      SDValue visitUMULO(SDNode *N);
      SDValue visitSDIVREM(SDNode *N);
      SDValue visitUDIVREM(SDNode *N);
+    SDValue visitIMINMAX(SDNode *N);
      SDValue visitAND(SDNode *N);
      SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *LocReference);
      SDValue visitOR(SDNode *N);
@@ -338,6 +339,7 @@ namespace {
                                           unsigned HiOp);
      SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
      SDValue CombineExtLoad(SDNode *N);
+    SDValue combineRepeatedFPDivisors(SDNode *N);
      SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
      SDValue BuildSDIV(SDNode *N);
      SDValue BuildSDIVPow2(SDNode *N);
@@ -409,7 +411,7 @@ namespace {
      void getStoreMergeAndAliasCandidates(
          StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
          SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes);
-    
+
      /// Merge consecutive store operations into a wide store.
      /// This optimization uses wide integers or vectors when possible.
      /// \return True if some memory operations were changed.
@@ -427,9 +429,7 @@ namespace {
      DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
          : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
            OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
-      auto *F = DAG.getMachineFunction().getFunction();
-      ForCodeSize = F->hasFnAttribute(Attribute::OptimizeForSize) ||
-                    F->hasFnAttribute(Attribute::MinSize);
+      ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
      }
  
      /// Runs the dag combiner on all nodes in the work list
@@ -1216,9 +1216,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
    LegalTypes = Level >= AfterLegalizeTypes;
  
    // Add all the dag nodes to the worklist.
-  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
-       E = DAG.allnodes_end(); I != E; ++I)
-    AddToWorklist(I);
+  for (SDNode &Node : DAG.allnodes())
+    AddToWorklist(&Node);
  
    // Create a dummy node (which is not added to allnodes), that adds a reference
    // to the root node, preventing it from being deleted, and tracking any
@@ -1343,6 +1342,10 @@ SDValue DAGCombiner::visit(SDNode *N) {
    case ISD::UMULO:              return visitUMULO(N);
    case ISD::SDIVREM:            return visitSDIVREM(N);
    case ISD::UDIVREM:            return visitUDIVREM(N);
+  case ISD::SMIN:
+  case ISD::SMAX:
+  case ISD::UMIN:
+  case ISD::UMAX:               return visitIMINMAX(N);
    case ISD::AND:                return visitAND(N);
    case ISD::OR:                 return visitOR(N);
    case ISD::XOR:                return visitXOR(N);
@@ -2178,6 +2181,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
                           N0, N1);
    }
  
+  bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
    // fold (sdiv X, pow2) -> simple ops after legalize
    // FIXME: We check for the exact bit here because the generic lowering gives
    // better results in that case. The target-specific lowering should learn how
@@ -2186,14 +2190,12 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
        !cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact() &&
        (N1C->getAPIntValue().isPowerOf2() ||
         (-N1C->getAPIntValue()).isPowerOf2())) {
-    // If dividing by powers of two is cheap, then don't perform the following
-    // fold.
-    if (TLI.isPow2SDivCheap())
+    // If integer division is cheap, then don't perform the following fold.
+    if (TLI.isIntDivCheap(N->getValueType(0), MinSize))
        return SDValue();
  
      // Target-specific implementation of sdiv x, pow2.
-    SDValue Res = BuildSDIVPow2(N);
-    if (Res.getNode())
+    if (SDValue Res = BuildSDIVPow2(N))
        return Res;
  
      unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();
@@ -2229,10 +2231,9 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
  
    // If integer divide is expensive and we satisfy the requirements, emit an
    // alternate sequence.
-  if (N1C && !TLI.isIntDivCheap()) {
-    SDValue Op = BuildSDIV(N);
-    if (Op.getNode()) return Op;
-  }
+  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), MinSize))
+    if (SDValue Op = BuildSDIV(N))
+      return Op;
  
    // undef / X -> 0
    if (N0.getOpcode() == ISD::UNDEF)
@@ -2284,11 +2285,12 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
        }
      }
    }
+  
    // fold (udiv x, c) -> alternate
-  if (N1C && !TLI.isIntDivCheap()) {
-    SDValue Op = BuildUDIV(N);
-    if (Op.getNode()) return Op;
-  }
+  bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
+  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), MinSize))
+    if (SDValue Op = BuildUDIV(N))
+      return Op;
  
    // undef / X -> 0
    if (N0.getOpcode() == ISD::UNDEF)
@@ -2532,8 +2534,8 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
  }
  
  SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
-  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS);
-  if (Res.getNode()) return Res;
+  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
+    return Res;
  
    EVT VT = N->getValueType(0);
    SDLoc DL(N);
@@ -2563,8 +2565,8 @@ SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
  }
  
  SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
-  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU);
-  if (Res.getNode()) return Res;
+  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
+    return Res;
  
    EVT VT = N->getValueType(0);
    SDLoc DL(N);
@@ -2614,15 +2616,39 @@ SDValue DAGCombiner::visitUMULO(SDNode *N) {
  }
  
  SDValue DAGCombiner::visitSDIVREM(SDNode *N) {
-  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM);
-  if (Res.getNode()) return Res;
+  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM))
+    return Res;
  
    return SDValue();
  }
  
  SDValue DAGCombiner::visitUDIVREM(SDNode *N) {
-  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::UDIV, ISD::UREM);
-  if (Res.getNode()) return Res;
+  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::UDIV, ISD::UREM))
+    return Res;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+
+  // fold vector ops
+  if (VT.isVector())
+    if (SDValue FoldedVOp = SimplifyVBinOp(N))
+      return FoldedVOp;
+
+  // fold operation with constant operands.
+  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
+  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, N0C, N1C);
+
+  // canonicalize constant to RHS
+  if (isConstantIntBuildVectorOrConstantInt(N0) &&
+     !isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
  
    return SDValue();
  }
@@ -3142,10 +3168,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
      return Combined;
  
    // Simplify: (and (op x...), (op y...))  -> (op (and x, y))
-  if (N0.getOpcode() == N1.getOpcode()) {
-    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
-    if (Tmp.getNode()) return Tmp;
-  }
+  if (N0.getOpcode() == N1.getOpcode())
+    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
+      return Tmp;
  
    // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
    // fold (and (sra)) -> (and (srl)) when possible.
@@ -3665,11 +3690,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
      return Combined;
  
    // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
-  SDValue BSwap = MatchBSwapHWord(N, N0, N1);
-  if (BSwap.getNode())
+  if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
      return BSwap;
-  BSwap = MatchBSwapHWordLow(N, N0, N1);
-  if (BSwap.getNode())
+  if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
      return BSwap;
  
    // reassociate or
@@ -3690,10 +3713,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
      }
    }
    // Simplify: (or (op x...), (op y...))  -> (op (or x, y))
-  if (N0.getOpcode() == N1.getOpcode()) {
-    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
-    if (Tmp.getNode()) return Tmp;
-  }
+  if (N0.getOpcode() == N1.getOpcode())
+    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
+      return Tmp;
  
    // See if this is some rotate idiom.
    if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
@@ -4112,10 +4134,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
    }
  
    // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
-  if (N0.getOpcode() == N1.getOpcode()) {
-    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
-    if (Tmp.getNode()) return Tmp;
-  }
+  if (N0.getOpcode() == N1.getOpcode())
+    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
+      return Tmp;
  
    // Simplify the expression using non-local knowledge.
    if (!VT.isVector() &&
@@ -4434,12 +4455,18 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);
    }
  
-  if (N1C && !N1C->isOpaque()) {
-    SDValue NewSHL = visitShiftByConstant(N, N1C);
-    if (NewSHL.getNode())
-      return NewSHL;
+  // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
+  if (N1C && N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse()) {
+    if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
+      SDValue Folded = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N1), VT, N0C1, N1C);
+      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Folded);
+    }
    }
  
+  if (N1C && !N1C->isOpaque())
+    if (SDValue NewSHL = visitShiftByConstant(N, N1C))
+      return NewSHL;
+
    return SDValue();
  }
  
@@ -4583,11 +4610,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
    if (DAG.SignBitIsZero(N0))
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
  
-  if (N1C && !N1C->isOpaque()) {
-    SDValue NewSRA = visitShiftByConstant(N, N1C);
-    if (NewSRA.getNode())
+  if (N1C && !N1C->isOpaque())
+    if (SDValue NewSRA = visitShiftByConstant(N, N1C))
        return NewSRA;
-  }
  
    return SDValue();
  }
@@ -4744,8 +4769,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
    // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
    if (N1.getOpcode() == ISD::TRUNCATE &&
        N1.getOperand(0).getOpcode() == ISD::AND) {
-    SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode());
-    if (NewOp1.getNode())
+    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
        return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
    }
  
@@ -4754,15 +4778,12 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
    if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
      return SDValue(N, 0);
  
-  if (N1C && !N1C->isOpaque()) {
-    SDValue NewSRL = visitShiftByConstant(N, N1C);
-    if (NewSRL.getNode())
+  if (N1C && !N1C->isOpaque())
+    if (SDValue NewSRL = visitShiftByConstant(N, N1C))
        return NewSRL;
-  }
  
    // Attempt to convert a srl of a load into a narrower zero-extending load.
-  SDValue NarrowLoad = ReduceLoadWidth(N);
-  if (NarrowLoad.getNode())
+  if (SDValue NarrowLoad = ReduceLoadWidth(N))
      return NarrowLoad;
  
    // Here is a common situation. We want to optimize:
@@ -4973,70 +4994,47 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
    if (SimplifySelectOps(N, N1, N2))
      return SDValue(N, 0);  // Don't revisit N.
  
-  // fold selects based on a setcc into other things, such as min/max/abs
-  if (N0.getOpcode() == ISD::SETCC) {
-    // select x, y (fcmp lt x, y) -> fminnum x, y
-    // select x, y (fcmp gt x, y) -> fmaxnum x, y
-    //
-    // This is OK if we don't care about what happens if either operand is a
-    // NaN.
-    //
-
-    // FIXME: Instead of testing for UnsafeFPMath, this should be checking for
-    // no signed zeros as well as no nans.
-    const TargetOptions &Options = DAG.getTarget().Options;
-    if (Options.UnsafeFPMath &&
-        VT.isFloatingPoint() && N0.hasOneUse() &&
-        DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) {
-      ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
-
-      SDValue FMinMax =
-          combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1),
-                              N1, N2, CC, TLI, DAG);
-      if (FMinMax)
-        return FMinMax;
-    }
-
-    if ((!LegalOperations &&
-         TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) ||
-        TLI.isOperationLegal(ISD::SELECT_CC, VT))
-      return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT,
-                         N0.getOperand(0), N0.getOperand(1),
-                         N1, N2, N0.getOperand(2));
-    return SimplifySelect(SDLoc(N), N0, N1, N2);
-  }
-
    if (VT0 == MVT::i1) {
-    if (TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT)) {
-      // select (and Cond0, Cond1), X, Y
-      //   -> select Cond0, (select Cond1, X, Y), Y
-      if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
-        SDValue Cond0 = N0->getOperand(0);
-        SDValue Cond1 = N0->getOperand(1);
-        SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N),
-                                          N1.getValueType(), Cond1, N1, N2);
+    // The code in this block deals with the following 2 equivalences:
+    //    select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
+    //    select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
+    // The target can specify its preferred form with the
+    // shouldNormalizeToSelectSequence() callback. However we always transform
+    // to the right side if we find the inner select already exists in the DAG,
+    // and we always transform to the left side if we know that we can further
+    // optimize the combination of the conditions.
+    bool normalizeToSequence
+      = TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
+    // select (and Cond0, Cond1), X, Y
+    //   -> select Cond0, (select Cond1, X, Y), Y
+    if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
+      SDValue Cond0 = N0->getOperand(0);
+      SDValue Cond1 = N0->getOperand(1);
+      SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N),
+                                        N1.getValueType(), Cond1, N1, N2);
+      if (normalizeToSequence || !InnerSelect.use_empty())
          return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0,
                             InnerSelect, N2);
-      }
-      // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
-      if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
-        SDValue Cond0 = N0->getOperand(0);
-        SDValue Cond1 = N0->getOperand(1);
-        SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N),
-                                          N1.getValueType(), Cond1, N1, N2);
+    }
+    // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
+    if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
+      SDValue Cond0 = N0->getOperand(0);
+      SDValue Cond1 = N0->getOperand(1);
+      SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N),
+                                        N1.getValueType(), Cond1, N1, N2);
+      if (normalizeToSequence || !InnerSelect.use_empty())
          return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, N1,
                             InnerSelect);
-      }
      }
  
      // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
-    if (N1->getOpcode() == ISD::SELECT) {
+    if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
        SDValue N1_0 = N1->getOperand(0);
        SDValue N1_1 = N1->getOperand(1);
        SDValue N1_2 = N1->getOperand(2);
        if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
          // Create the actual and node if we can generate good code for it.
-        if (!TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT)) {
+        if (!normalizeToSequence) {
            SDValue And = DAG.getNode(ISD::AND, SDLoc(N), N0.getValueType(),
                                      N0, N1_0);
            return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), And,
@@ -5049,13 +5047,13 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
        }
      }
      // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
-    if (N2->getOpcode() == ISD::SELECT) {
+    if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
        SDValue N2_0 = N2->getOperand(0);
        SDValue N2_1 = N2->getOperand(1);
        SDValue N2_2 = N2->getOperand(2);
        if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
          // Create the actual or node if we can generate good code for it.
-        if (!TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT)) {
+        if (!normalizeToSequence) {
            SDValue Or = DAG.getNode(ISD::OR, SDLoc(N), N0.getValueType(),
                                     N0, N2_0);
            return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Or,
@@ -5069,6 +5067,38 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
      }
    }
  
+  // fold selects based on a setcc into other things, such as min/max/abs
+  if (N0.getOpcode() == ISD::SETCC) {
+    // select x, y (fcmp lt x, y) -> fminnum x, y
+    // select x, y (fcmp gt x, y) -> fmaxnum x, y
+    //
+    // This is OK if we don't care about what happens if either operand is a
+    // NaN.
+    //
+
+    // FIXME: Instead of testing for UnsafeFPMath, this should be checking for
+    // no signed zeros as well as no nans.
+    const TargetOptions &Options = DAG.getTarget().Options;
+    if (Options.UnsafeFPMath &&
+        VT.isFloatingPoint() && N0.hasOneUse() &&
+        DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) {
+      ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+
+      if (SDValue FMinMax = combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0),
+                                                N0.getOperand(1), N1, N2, CC,
+                                                TLI, DAG))
+        return FMinMax;
+    }
+
+    if ((!LegalOperations &&
+         TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) ||
+        TLI.isOperationLegal(ISD::SELECT_CC, VT))
+      return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT,
+                         N0.getOperand(0), N0.getOperand(1),
+                         N1, N2, N0.getOperand(2));
+    return SimplifySelect(SDLoc(N), N0, N1, N2);
+  }
+
    return SDValue();
  }
  
@@ -5523,8 +5553,7 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
    if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
        N2.getOpcode() == ISD::CONCAT_VECTORS &&
        ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
-    SDValue CV = ConvertSelectToConcatVector(N, DAG);
-    if (CV.getNode())
+    if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
        return CV;
    }
  
@@ -5580,7 +5609,7 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) {
                         SDLoc(N));
  }
  
-/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or 
+/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
  /// a build_vector of constants.
  /// This function is called by the DAGCombiner when visiting sext/zext/aext
  /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
@@ -5837,8 +5866,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
    if (N0.getOpcode() == ISD::TRUNCATE) {
      // fold (sext (truncate (load x))) -> (sext (smaller load x))
      // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
-    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
-    if (NarrowLoad.getNode()) {
+    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
        SDNode* oye = N0.getNode()->getOperand(0).getNode();
        if (NarrowLoad.getNode() != N0.getNode()) {
          CombineTo(N0.getNode(), NarrowLoad);
@@ -6120,8 +6148,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
    // fold (zext (truncate (load x))) -> (zext (smaller load x))
    // fold (zext (truncate (srl (load x), c))) -> (zext (small load (x+c/n)))
    if (N0.getOpcode() == ISD::TRUNCATE) {
-    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
-    if (NarrowLoad.getNode()) {
+    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
        SDNode* oye = N0.getNode()->getOperand(0).getNode();
        if (NarrowLoad.getNode() != N0.getNode()) {
          CombineTo(N0.getNode(), NarrowLoad);
@@ -6133,32 +6160,45 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
    }
  
    // fold (zext (truncate x)) -> (and x, mask)
-  if (N0.getOpcode() == ISD::TRUNCATE &&
-      (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) {
-
+  if (N0.getOpcode() == ISD::TRUNCATE) {
      // fold (zext (truncate (load x))) -> (zext (smaller load x))
      // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
-    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
-    if (NarrowLoad.getNode()) {
-      SDNode* oye = N0.getNode()->getOperand(0).getNode();
+    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
+      SDNode *oye = N0.getNode()->getOperand(0).getNode();
        if (NarrowLoad.getNode() != N0.getNode()) {
          CombineTo(N0.getNode(), NarrowLoad);
          // CombineTo deleted the truncate, if needed, but not what's under it.
          AddToWorklist(oye);
        }
-      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+      return SDValue(N, 0); // Return N so it doesn't get rechecked!
      }
  
-    SDValue Op = N0.getOperand(0);
-    if (Op.getValueType().bitsLT(VT)) {
-      Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
-      AddToWorklist(Op.getNode());
-    } else if (Op.getValueType().bitsGT(VT)) {
-      Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
-      AddToWorklist(Op.getNode());
+    EVT SrcVT = N0.getOperand(0).getValueType();
+    EVT MinVT = N0.getValueType();
+
+    // Try to mask before the extension to avoid having to generate a larger mask,
+    // possibly over several sub-vectors.
+    if (SrcVT.bitsLT(VT)) {
+      if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
+                               TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
+        SDValue Op = N0.getOperand(0);
+        Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
+        AddToWorklist(Op.getNode());
+        return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
+      }
+    }
+
+    if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
+      SDValue Op = N0.getOperand(0);
+      if (SrcVT.bitsLT(VT)) {
+        Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
+        AddToWorklist(Op.getNode());
+      } else if (SrcVT.bitsGT(VT)) {
+        Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
+        AddToWorklist(Op.getNode());
+      }
+      return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
      }
-    return DAG.getZeroExtendInReg(Op, SDLoc(N),
-                                  N0.getValueType().getScalarType());
    }
  
    // Fold (zext (and (trunc x), cst)) -> (and x, cst),
@@ -6378,8 +6418,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
    // fold (aext (truncate (load x))) -> (aext (smaller load x))
    // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
    if (N0.getOpcode() == ISD::TRUNCATE) {
-    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
-    if (NarrowLoad.getNode()) {
+    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
        SDNode* oye = N0.getNode()->getOperand(0).getNode();
        if (NarrowLoad.getNode() != N0.getNode()) {
          CombineTo(N0.getNode(), NarrowLoad);
@@ -6546,8 +6585,7 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
        // Watch out for shift count overflow though.
        if (Amt >= Mask.getBitWidth()) break;
        APInt NewMask = Mask << Amt;
-      SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask);
-      if (SimplifyLHS.getNode())
+      if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask))
          return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),
                             SimplifyLHS, V.getOperand(1));
      }
@@ -6771,8 +6809,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
  
    // fold (sext_in_reg (load x)) -> (smaller sextload x)
    // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
-  SDValue NarrowLoad = ReduceLoadWidth(N);
-  if (NarrowLoad.getNode())
+  if (SDValue NarrowLoad = ReduceLoadWidth(N))
      return NarrowLoad;
  
    // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
@@ -6999,9 +7036,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
    // fold (truncate (load x)) -> (smaller load x)
    // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
    if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
-    SDValue Reduced = ReduceLoadWidth(N);
-    if (Reduced.getNode())
+    if (SDValue Reduced = ReduceLoadWidth(N))
        return Reduced;
+
      // Handle the case where the load remains an extending load even
      // after truncation.
      if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
@@ -7240,11 +7277,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
    }
  
    // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
-  if (N0.getOpcode() == ISD::BUILD_PAIR) {
-    SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT);
-    if (CombineLD.getNode())
+  if (N0.getOpcode() == ISD::BUILD_PAIR)
+    if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
        return CombineLD;
-  }
  
    // Remove double bitcasts from shuffles - this is often a legacy of
    // XformToShuffleWithZero being used to combine bitmaskings (of
@@ -7257,10 +7292,10 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
      ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);
  
      // If operands are a bitcast, peek through if it casts the original VT.
-    // If operands are a UNDEF or constant, just bitcast back to original VT.
+    // If operands are a constant, just bitcast back to original VT.
      auto PeekThroughBitcast = [&](SDValue Op) {
        if (Op.getOpcode() == ISD::BITCAST &&
-          Op.getOperand(0)->getValueType(0) == VT)
+          Op.getOperand(0).getValueType() == VT)
          return SDValue(Op.getOperand(0));
        if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
            ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
@@ -7453,6 +7488,14 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
    bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
    bool LookThroughFPExt = TLI.isFPExtFree(VT);
  
+  // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
+  // prefer to fold the multiply with fewer uses.
+  if (Aggressive && N0.getOpcode() == ISD::FMUL &&
+      N1.getOpcode() == ISD::FMUL) {
+    if (N0.getNode()->use_size() > N1.getNode()->use_size())
+      std::swap(N0, N1);
+  }
+
    // fold (fadd (fmul x, y), z) -> (fma x, y, z)
    if (N0.getOpcode() == ISD::FMUL &&
        (Aggressive || N0->hasOneUse())) {
@@ -8003,8 +8046,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
    } // enable-unsafe-fp-math
  
    // FADD -> FMA combines:
-  SDValue Fused = visitFADDForFMACombine(N);
-  if (Fused) {
+  if (SDValue Fused = visitFADDForFMACombine(N)) {
      AddToWorklist(Fused.getNode());
      return Fused;
    }
@@ -8068,8 +8110,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
    }
  
    // FSUB -> FMA combines:
-  SDValue Fused = visitFSUBForFMACombine(N);
-  if (Fused) {
+  if (SDValue Fused = visitFSUBForFMACombine(N)) {
      AddToWorklist(Fused.getNode());
      return Fused;
    }
@@ -8139,7 +8180,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
      // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs
      // during an early run of DAGCombiner can prevent folding with fmuls
      // inserted during lowering.
-    if (N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1)) {
+    if (N0.getOpcode() == ISD::FADD &&
+        (N0.getOperand(0) == N0.getOperand(1)) &&
+        N0.hasOneUse()) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
        SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1);
        return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts);
@@ -8253,6 +8296,66 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
    return SDValue();
  }
  
+// Combine multiple FDIVs with the same divisor into multiple FMULs by the
+// reciprocal.
+// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
+// Notice that this is not always beneficial. One reason is that targets may
+// have different costs for FDIV and FMUL, so sometimes the cost of two FDIVs
+// may be lower than the cost of one FDIV and two FMULs. Another reason is that
+// the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
+SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  // Skip if current node is a reciprocal.
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  if (N0CFP && N0CFP->isExactlyValue(1.0))
+    return SDValue();
+
+  // Exit early if the target does not want this transform or if there can't
+  // possibly be enough uses of the divisor to make the transform worthwhile.
+  SDValue N1 = N->getOperand(1);
+  unsigned MinUses = TLI.combineRepeatedFPDivisors();
+  if (!MinUses || N1->use_size() < MinUses)
+    return SDValue();
+
+  // Find all FDIV users of the same divisor.
+  // Use a set because duplicates may be present in the user list.
+  SetVector<SDNode *> Users;
+  for (auto *U : N1->uses())
+    if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1)
+      Users.insert(U);
+
+  // Now that we have the actual number of divisor uses, make sure it meets
+  // the minimum threshold specified by the target.
+  if (Users.size() < MinUses)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
+  // FIXME: This optimization requires some level of fast-math, so the
+  // created reciprocal node should at least have the 'allowReciprocal'
+  // fast-math-flag set.
+  SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1);
+
+  // Dividend / Divisor -> Dividend * Reciprocal
+  for (auto *U : Users) {
+    SDValue Dividend = U->getOperand(0);
+    if (Dividend != FPOne) {
+      SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
+                                    Reciprocal);
+      CombineTo(U, NewNode);
+    } else if (U != Reciprocal.getNode()) {
+      // In the absence of fast-math-flags, this user node is always the
+      // same node as Reciprocal, but with FMF they may be different nodes.
+      CombineTo(U, Reciprocal);
+    }
+  }
+  return SDValue(N, 0);  // N was replaced.
+}
+
  SDValue DAGCombiner::visitFDIV(SDNode *N) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
@@ -8353,48 +8456,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
      }
    }
  
-  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
-  // reciprocal.
-  // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
-  // Notice that this is not always beneficial. One reason is different target
-  // may have different costs for FDIV and FMUL, so sometimes the cost of two
-  // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
-  // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
-  if (Options.UnsafeFPMath) {
-    // Skip if current node is a reciprocal.
-    if (N0CFP && N0CFP->isExactlyValue(1.0))
-      return SDValue();
-
-    SmallVector<SDNode *, 4> Users;
-    // Find all FDIV users of the same divisor.
-    for (auto *U : N1->uses()) {
-      if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1)
-        Users.push_back(U);
-    }
-
-    if (TLI.combineRepeatedFPDivisors(Users.size())) {
-      SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
-      // FIXME: This optimization requires some level of fast-math, so the
-      // created reciprocal node should at least have the 'allowReciprocal'
-      // fast-math-flag set.
-      SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1);
-
-      // Dividend / Divisor -> Dividend * Reciprocal
-      for (auto *U : Users) {
-        SDValue Dividend = U->getOperand(0);
-        if (Dividend != FPOne) {
-          SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
-                                        Reciprocal);
-          CombineTo(U, NewNode);
-        } else if (U != Reciprocal.getNode()) {
-          // In the absence of fast-math-flags, this user node is always the
-          // same node as Reciprocal, but with FMF they may be different nodes.
-          CombineTo(U, Reciprocal);
-        }
-      }
-      return SDValue(N, 0);  // N was replaced.
-    }
-  }
+  if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
+    return CombineRepeatedDivisors;
  
    return SDValue();
  }
@@ -8421,7 +8484,7 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
    SDValue RV = BuildRsqrtEstimate(N->getOperand(0));
    if (!RV)
      return SDValue();
-  
+
    EVT VT = RV.getValueType();
    SDLoc DL(N);
    RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV);
@@ -9034,8 +9097,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
      SDValue Op1 = TheXor->getOperand(1);
      if (Op0.getOpcode() == Op1.getOpcode()) {
        // Avoid missing important xor optimizations.
-      SDValue Tmp = visitXOR(TheXor);
-      if (Tmp.getNode()) {
+      if (SDValue Tmp = visitXOR(TheXor)) {
          if (Tmp.getNode() != TheXor) {
            DEBUG(dbgs() << "\nReplacing.8 ";
                  TheXor->dump(&DAG);
@@ -9722,8 +9784,8 @@ struct LoadedSlice {
      void addSliceGain(const LoadedSlice &LS) {
        // Each slice saves a truncate.
        const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
-      if (!TLI.isTruncateFree(LS.Inst->getValueType(0),
-                              LS.Inst->getOperand(0).getValueType()))
+      if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
+                              LS.Inst->getValueType(0)))
          ++Truncates;
        // If there is a shift amount, this slice gets rid of it.
        if (LS.Shift)
@@ -10743,17 +10805,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
    return true;
  }
  
-static bool allowableAlignment(const SelectionDAG &DAG,
-                               const TargetLowering &TLI, EVT EVTTy,
-                               unsigned AS, unsigned Align) {
-  if (TLI.allowsMisalignedMemoryAccesses(EVTTy, AS, Align))
-    return true;
-
-  Type *Ty = EVTTy.getTypeForEVT(*DAG.getContext());
-  unsigned ABIAlignment = DAG.getDataLayout().getPrefTypeAlignment(Ty);
-  return (Align >= ABIAlignment);
-}
-
  void DAGCombiner::getStoreMergeAndAliasCandidates(
      StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
      SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) {
@@ -10868,12 +10919,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
    // We need to make sure that these nodes do not interfere with
    // any of the store nodes.
    SmallVector<LSBaseSDNode*, 8> AliasLoadNodes;
-  
+
    // Save the StoreSDNodes that we find in the chain.
    SmallVector<MemOpLink, 8> StoreNodes;
  
    getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes);
-  
+
    // Check if there is anything to merge.
    if (StoreNodes.size() < 2)
      return false;
@@ -10919,6 +10970,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
    unsigned FirstStoreAS = FirstInChain->getAddressSpace();
    unsigned FirstStoreAlign = FirstInChain->getAlignment();
+  LLVMContext &Context = *DAG.getContext();
+  const DataLayout &DL = DAG.getDataLayout();
  
    // Store the constants into memory as one consecutive store.
    if (IsConstantSrc) {
@@ -10940,27 +10993,28 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
  
        // Find a legal type for the constant store.
        unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
-      EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
+      EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
        if (TLI.isTypeLegal(StoreTy) &&
-          allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS,
-                             FirstStoreAlign)) {
+          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+                                 FirstStoreAlign)) {
          LastLegalType = i+1;
        // Or check whether a truncstore is legal.
-      } else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) ==
+      } else if (TLI.getTypeAction(Context, StoreTy) ==
                   TargetLowering::TypePromoteInteger) {
          EVT LegalizedStoredValueTy =
-          TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
+          TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
          if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
-            allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstStoreAS,
-                               FirstStoreAlign)) {
+            TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+                                   FirstStoreAS, FirstStoreAlign)) {
            LastLegalType = i + 1;
          }
        }
  
        // Find a legal type for the vector store.
-      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+      EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
        if (TLI.isTypeLegal(Ty) &&
-          allowableAlignment(DAG, TLI, Ty, FirstStoreAS, FirstStoreAlign)) {
+          TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+                                 FirstStoreAlign)) {
          LastLegalVectorType = i + 1;
        }
      }
@@ -11004,9 +11058,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
          return false;
  
        // Find a legal type for the vector store.
-      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+      EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
        if (TLI.isTypeLegal(Ty) &&
-          allowableAlignment(DAG, TLI, Ty, FirstStoreAS, FirstStoreAlign))
+          TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+                                 FirstStoreAlign))
          NumElem = i + 1;
      }
  
@@ -11094,33 +11149,37 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
      LastConsecutiveLoad = i;
  
      // Find a legal type for the vector store.
-    EVT StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+    EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1);
      if (TLI.isTypeLegal(StoreTy) &&
-        allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, FirstStoreAlign) &&
-        allowableAlignment(DAG, TLI, StoreTy, FirstLoadAS, FirstLoadAlign)) {
+        TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+                               FirstStoreAlign) &&
+        TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+                               FirstLoadAlign)) {
        LastLegalVectorType = i + 1;
      }
  
      // Find a legal type for the integer store.
      unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
-    StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
+    StoreTy = EVT::getIntegerVT(Context, SizeInBits);
      if (TLI.isTypeLegal(StoreTy) &&
-        allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, FirstStoreAlign) &&
-        allowableAlignment(DAG, TLI, StoreTy, FirstLoadAS, FirstLoadAlign))
+        TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+                               FirstStoreAlign) &&
+        TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+                               FirstLoadAlign))
        LastLegalIntegerType = i + 1;
      // Or check whether a truncstore and extload is legal.
-    else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) ==
+    else if (TLI.getTypeAction(Context, StoreTy) ==
               TargetLowering::TypePromoteInteger) {
        EVT LegalizedStoredValueTy =
-        TLI.getTypeToTransformTo(*DAG.getContext(), StoreTy);
+        TLI.getTypeToTransformTo(Context, StoreTy);
        if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
            TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
            TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
            TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
-          allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstStoreAS,
-                             FirstStoreAlign) &&
-          allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstLoadAS,
-                             FirstLoadAlign))
+          TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+                                 FirstStoreAS, FirstStoreAlign) &&
+          TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+                                 FirstLoadAS, FirstLoadAlign))
          LastLegalIntegerType = i+1;
      }
    }
@@ -11155,10 +11214,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
    // to memory.
    EVT JointMemOpVT;
    if (UseVectorTy) {
-    JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+    JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem);
    } else {
      unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
-    JointMemOpVT = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
+    JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
    }
  
    SDLoc LoadDL(LoadNodes[0].MemNode);
@@ -11319,8 +11378,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
  
    // Try transforming a pair floating point load / store ops to integer
    // load / store ops.
-  SDValue NewST = TransformFPLoadStorePair(N);
-  if (NewST.getNode())
+  if (SDValue NewST = TransformFPLoadStorePair(N))
      return NewST;
  
    bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
@@ -12183,12 +12241,81 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
                       DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Ops));
  }
  
-SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
-  // TODO: Check to see if this is a CONCAT_VECTORS of a bunch of
-  // EXTRACT_SUBVECTOR operations.  If so, and if the EXTRACT_SUBVECTOR vector
-  // inputs come from at most two distinct vectors, turn this into a shuffle
-  // node.
+// Try to fold a CONCAT_VECTORS whose operands are each UNDEF or an
+// EXTRACT_SUBVECTOR (possibly behind bitcasts) of at most two distinct vectors
+// the same size as the result into a single VECTOR_SHUFFLE. Returns an empty
+// SDValue if the pattern does not match or the target rejects the mask.
+static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  EVT OpVT = N->getOperand(0).getValueType();
+  int NumElts = VT.getVectorNumElements();
+  int NumOpElts = OpVT.getVectorNumElements();
+
+  SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
+  SmallVector<int, 8> Mask;
  
+  for (SDValue Op : N->ops()) {
+    // Look through any chain of bitcasts to the underlying value.
+    while (Op.getOpcode() == ISD::BITCAST)
+      Op = Op.getOperand(0);
+
+    // An UNDEF operand contributes NumOpElts undef (-1) mask entries.
+    if (Op.getOpcode() == ISD::UNDEF) {
+      Mask.append((unsigned)NumOpElts, -1);
+      continue;
+    }
+
+    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+      return SDValue();
+
+    // Which vector is the subvector taken from, and at what constant index?
+    SDValue ExtVec = Op.getOperand(0);
+    if (ExtVec.getOpcode() == ISD::UNDEF) {
+      Mask.append((unsigned)NumOpElts, -1);
+      continue;
+    }
+
+    EVT ExtVT = ExtVec.getValueType();
+    if (!isa<ConstantSDNode>(Op.getOperand(1)))
+      return SDValue();
+    int ExtIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+    // Only a source with the same total bit width as the result can be used,
+    // since the chosen sources are bitcast to VT when building the shuffle.
+    if (ExtVT.getSizeInBits() != VT.getSizeInBits())
+      return SDValue();
+
+    // Rescale the index from ExtVec element units to result element units.
+    int NumExtElts = ExtVT.getVectorNumElements();
+    if (0 == (NumExtElts % NumElts))
+      ExtIdx /= (NumExtElts / NumElts);
+    else if (0 == (NumElts % NumExtElts))
+      ExtIdx *= (NumElts / NumExtElts);
+    else
+      return SDValue();
+
+    // A shuffle has two inputs; lanes of the second are offset by NumElts.
+    if (SV0.getOpcode() == ISD::UNDEF || SV0 == ExtVec) {
+      SV0 = ExtVec;
+      for (int i = 0; i != NumOpElts; ++i)
+        Mask.push_back(i + ExtIdx);
+    } else if (SV1.getOpcode() == ISD::UNDEF || SV1 == ExtVec) {
+      SV1 = ExtVec;
+      for (int i = 0; i != NumOpElts; ++i)
+        Mask.push_back(i + ExtIdx + NumElts);
+    } else {
+      return SDValue();
+    }
+  }
+
+  if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT))
+    return SDValue();
+
+  return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
+                              DAG.getBitcast(VT, SV1), Mask);
+}
+
+SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
    // If we only have one input vector, we don't need to do any concatenation.
    if (N->getNumOperands() == 1)
      return N->getOperand(0);
@@ -12289,6 +12416,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
    if (SDValue V = combineConcatVectorOfScalars(N, DAG))
      return V;
  
+  // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
+  if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
+    if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
+      return V;
+
    // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
    // nodes often generate nop CONCAT_VECTOR nodes.
    // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that
@@ -13002,34 +13134,76 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
    if (RHS.getOpcode() == ISD::BITCAST)
      RHS = RHS.getOperand(0);
  
-  if (RHS.getOpcode() == ISD::BUILD_VECTOR) {
+  if (RHS.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  EVT RVT = RHS.getValueType();
+  unsigned NumElts = RHS.getNumOperands();
+
+  // Attempt to create a valid clear mask by splitting each mask element into
+  // 'Split' sub-elements and checking that every sub-element is undef, all
+  // zeros or all ones - the only values a shuffle with zero can express.
+  auto BuildClearMask = [&](int Split) {
+    int NumSubElts = NumElts * Split;
+    int NumSubBits = RVT.getScalarSizeInBits() / Split;
+
      SmallVector<int, 8> Indices;
-    unsigned NumElts = RHS.getNumOperands();
+    for (int i = 0; i != NumSubElts; ++i) {
+      int EltIdx = i / Split;
+      int SubIdx = i % Split;
+      SDValue Elt = RHS.getOperand(EltIdx);
+      if (Elt.getOpcode() == ISD::UNDEF) {
+        Indices.push_back(-1);
+        continue;
+      }
  
-    for (unsigned i = 0; i != NumElts; ++i) {
-      SDValue Elt = RHS.getOperand(i);
-      if (isAllOnesConstant(Elt))
+      APInt Bits;
+      if (isa<ConstantSDNode>(Elt))
+        Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
+      else if (isa<ConstantFPSDNode>(Elt))
+        Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
+      else
+        return SDValue();
+
+      // Extract the sub element from the constant bit mask.
+      if (DAG.getDataLayout().isBigEndian()) {
+        Bits = Bits.lshr((Split - SubIdx - 1) * NumSubBits);
+      } else {
+        Bits = Bits.lshr(SubIdx * NumSubBits);
+      }
+
+      if (Split > 1)
+        Bits = Bits.trunc(NumSubBits);
+
+      if (Bits.isAllOnesValue())
          Indices.push_back(i);
-      else if (isNullConstant(Elt))
-        Indices.push_back(NumElts+i);
+      else if (Bits == 0)
+        Indices.push_back(i + NumSubElts);
        else
          return SDValue();
      }
  
      // Let's see if the target supports this vector_shuffle.
-    EVT RVT = RHS.getValueType();
-    if (!TLI.isVectorClearMaskLegal(Indices, RVT))
+    EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
+    EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
+    if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
        return SDValue();
  
-    // Return the new VECTOR_SHUFFLE node.
-    EVT EltVT = RVT.getVectorElementType();
-    SmallVector<SDValue,8> ZeroOps(RVT.getVectorNumElements(),
-                                   DAG.getConstant(0, dl, EltVT));
-    SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, RVT, ZeroOps);
-    LHS = DAG.getNode(ISD::BITCAST, dl, RVT, LHS);
-    SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]);
-    return DAG.getNode(ISD::BITCAST, dl, VT, Shuf);
-  }
+    SDValue Zero = DAG.getConstant(0, dl, ClearVT);
+    return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, dl,
+                                                   DAG.getBitcast(ClearVT, LHS),
+                                                   Zero, &Indices[0]));
+  };
+
+  // Determine maximum split level (byte level masking).
+  int MaxSplit = 1;
+  if (RVT.getScalarSizeInBits() % 8 == 0)
+    MaxSplit = RVT.getScalarSizeInBits() / 8;
+
+  for (int Split = 1; Split <= MaxSplit; ++Split)
+    if (RVT.getScalarSizeInBits() % Split == 0)
+      if (SDValue S = BuildClearMask(Split))
+        return S;
  
    return SDValue();
  }
@@ -13042,9 +13216,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
  
-  if (SDValue Shuffle = XformToShuffleWithZero(N))
-    return Shuffle;
-
    // If the LHS and RHS are BUILD_VECTOR nodes, see if we can constant fold
    // this operation.
    if (LHS.getOpcode() == ISD::BUILD_VECTOR &&
@@ -13095,6 +13266,10 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
        return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops);
    }
  
+  // Try to convert a constant mask AND into a shuffle clear mask.
+  if (SDValue Shuffle = XformToShuffleWithZero(N))
+    return Shuffle;
+
    // Type legalization might introduce new shuffles in the DAG.
    // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
    //   -> (shuffle (VBinOp (A, B)), Undef, Mask).
@@ -13390,9 +13565,10 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
          CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx,
                              CstOffset);
          AddToWorklist(CPIdx.getNode());
-        return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
-                           MachinePointerInfo::getConstantPool(), false,
-                           false, false, Alignment);
+        return DAG.getLoad(
+            TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
+            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+            false, false, false, Alignment);
        }
      }