From c2328d552c443b196409582ee98c240828b7df44 Mon Sep 17 00:00:00 2001
From: Gerolf Hoflehner <ghoflehner@apple.com>
Date: Tue, 5 Aug 2014 01:16:13 +0000
Subject: [PATCH 1/1] MachineCombiner Pass for selecting faster instruction 
 sequence on AArch64

Re-commit of r214669 without changes to test cases
LLVM::CodeGen/AArch64/arm64-neon-mul-div.ll and
LLVM:: CodeGen/AArch64/dp-3source.ll
This resolves the reported compfails of the original commit.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214832 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64InstrFormats.td     |   5 +-
 lib/Target/AArch64/AArch64InstrInfo.cpp       | 476 +++++++++++++++++-
 lib/Target/AArch64/AArch64InstrInfo.h         |  20 +-
 .../AArch64/AArch64MachineCombinerPattern.h   |  42 ++
 lib/Target/AArch64/AArch64TargetMachine.cpp   |   6 +
 test/CodeGen/AArch64/madd-lohi.ll             |  19 +
 test/CodeGen/AArch64/mul-lohi.ll              |  13 +-
 7 files changed, 561 insertions(+), 20 deletions(-)
 create mode 100644 lib/Target/AArch64/AArch64MachineCombinerPattern.h
 create mode 100644 test/CodeGen/AArch64/madd-lohi.ll
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 106f2158909..3f764239290 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1351,14 +1351,15 @@ class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
 }
 
 multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+  // MADD/MSUB generation is decided by MachineCombiner.cpp
   def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
-      [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
+      [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>,
       Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
     let Inst{31} = 0;
   }
 
   def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
-      [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
+      [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>,
       Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
     let Inst{31} = 1;
   }
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 48235186176..e9fecbf81d7 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -14,6 +14,7 @@
 #include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "AArch64MachineCombinerPattern.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
@@ -697,17 +698,12 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) {
   return true;
 }
 
-/// optimizeCompareInstr - Convert the instruction supplying the argument to the
-/// comparison into one that sets the zero bit in the flags register.
-bool AArch64InstrInfo::optimizeCompareInstr(
-    MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
-    int CmpValue, const MachineRegisterInfo *MRI) const {
-
-  // Replace SUBSWrr with SUBWrr if NZCV is not used.
-  int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true);
-  if (Cmp_NZCV != -1) {
+/// convertFlagSettingOpcode - return opcode that does not
+/// set flags when possible. The caller is responsible to do
+/// the actual substitution and legality checking.
+static unsigned convertFlagSettingOpcode(MachineInstr *MI) {
     unsigned NewOpc;
-    switch (CmpInstr->getOpcode()) {
+    switch (MI->getOpcode()) {
     default:
       return false;
     case AArch64::ADDSWrr:      NewOpc = AArch64::ADDWrr; break;
@@ -727,7 +723,22 @@ bool AArch64InstrInfo::optimizeCompareInstr(
     case AArch64::SUBSXrs:      NewOpc = AArch64::SUBXrs; break;
     case AArch64::SUBSXrx:      NewOpc = AArch64::SUBXrx; break;
     }
+    return NewOpc;
+}
 
+/// optimizeCompareInstr - Convert the instruction supplying the argument to the
+/// comparison into one that sets the zero bit in the flags register.
+bool AArch64InstrInfo::optimizeCompareInstr(
+    MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+    int CmpValue, const MachineRegisterInfo *MRI) const {
+
+  // Replace SUBSWrr with SUBWrr if NZCV is not used.
+  int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true);
+  if (Cmp_NZCV != -1) {
+    unsigned Opc = CmpInstr->getOpcode();
+    unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
+    if (NewOpc == Opc)
+      return false;
     const MCInstrDesc &MCID = get(NewOpc);
     CmpInstr->setDesc(MCID);
     CmpInstr->RemoveOperand(Cmp_NZCV);
@@ -2185,3 +2196,448 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
   NopInst.setOpcode(AArch64::HINT);
   NopInst.addOperand(MCOperand::CreateImm(0));
 }
+/// useMachineCombiner - return true when a target supports MachineCombiner
+bool AArch64InstrInfo::useMachineCombiner(void) const {
+  // AArch64 supports the combiner
+  return true;
+}
+//
+// True when Opc sets flag
+static bool isCombineInstrSettingFlag(unsigned Opc) {
+  switch (Opc) {
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWri:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXri:
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSXrr:
+  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+  case AArch64::SUBSWri:
+  case AArch64::SUBSXri:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+//
+// 32b Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate32(unsigned Opc) {
+  switch (Opc) {
+  case AArch64::ADDWrr:
+  case AArch64::ADDWri:
+  case AArch64::SUBWrr:
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWri:
+  case AArch64::SUBSWrr:
+  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+  case AArch64::SUBWri:
+  case AArch64::SUBSWri:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+//
+// 64b Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate64(unsigned Opc) {
+  switch (Opc) {
+  case AArch64::ADDXrr:
+  case AArch64::ADDXri:
+  case AArch64::SUBXrr:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXri:
+  case AArch64::SUBSXrr:
+  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+  case AArch64::SUBXri:
+  case AArch64::SUBSXri:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+//
+// Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate(unsigned Opc) {
+  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
+}
+
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+                              unsigned MulOpc, unsigned ZeroReg) {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineInstr *MI = nullptr;
+  // We need a virtual register definition.
+  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    MI = MRI.getUniqueVRegDef(MO.getReg());
+  // And it needs to be in the trace (otherwise, it won't have a depth).
+  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc)
+    return false;
+
+  assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
+         MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+         MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
+
+  // The third input reg must be zero.
+  if (MI->getOperand(3).getReg() != ZeroReg)
+    return false;
+
+  // Must only used by the user we combine with.
+  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+    return false;
+
+  return true;
+}
+
+/// hasPattern - return true when there is potentially a faster code sequence
+/// for an instruction chain ending in \p Root. All potential patterns are
+/// listed
+/// in the \p Pattern vector. Pattern should be sorted in priority order since
+/// the pattern evaluator stops checking as soon as it finds a faster sequence.
+
+bool AArch64InstrInfo::hasPattern(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const {
+  unsigned Opc = Root.getOpcode();
+  MachineBasicBlock &MBB = *Root.getParent();
+  bool Found = false;
+
+  if (!isCombineInstrCandidate(Opc))
+    return 0;
+  if (isCombineInstrSettingFlag(Opc)) {
+    int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
+    // When NZCV is live bail out.
+    if (Cmp_NZCV == -1)
+      return 0;
+    unsigned NewOpc = convertFlagSettingOpcode(&Root);
+    // When opcode can't change bail out.
+    // CHECKME: do we miss any cases for opcode conversion?
+    if (NewOpc == Opc)
+      return 0;
+    Opc = NewOpc;
+  }
+
+  switch (Opc) {
+  default:
+    break;
+  case AArch64::ADDWrr:
+    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+           "ADDWrr does not have register operands");
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP1);
+      Found = true;
+    }
+    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::ADDXrr:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP1);
+      Found = true;
+    }
+    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::SUBWrr:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP1);
+      Found = true;
+    }
+    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::SUBXrr:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP1);
+      Found = true;
+    }
+    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::ADDWri:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULADDWI_OP1);
+      Found = true;
+    }
+    break;
+  case AArch64::ADDXri:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULADDXI_OP1);
+      Found = true;
+    }
+    break;
+  case AArch64::SUBWri:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1);
+      Found = true;
+    }
+    break;
+  case AArch64::SUBXri:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Pattern.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1);
+      Found = true;
+    }
+    break;
+  }
+  return Found;
+}
+
+/// genMadd - Generate madd instruction and combine mul and add.
+/// Example:
+///  MUL I=A,B,0
+///  ADD R,I,C
+///  ==> MADD R,A,B,C
+/// \param Root is the ADD instruction
+/// \param [out] InsInstr is a vector of machine instructions and will
+/// contain the generated madd instruction
+/// \param IdxMulOpd is index of operand in Root that is the result of
+/// the MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode fo the madd instruction
+static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
+                             const TargetInstrInfo *TII, MachineInstr &Root,
+                             SmallVectorImpl<MachineInstr *> &InsInstrs,
+                             unsigned IdxMulOpd, unsigned MaddOpc) {
+  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
+  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+  MachineOperand R = Root.getOperand(0);
+  MachineOperand A = MUL->getOperand(1);
+  MachineOperand B = MUL->getOperand(2);
+  MachineOperand C = Root.getOperand(IdxOtherOpd);
+  MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc))
+                                .addOperand(R)
+                                .addOperand(A)
+                                .addOperand(B)
+                                .addOperand(C);
+  // Insert the MADD
+  InsInstrs.push_back(MIB);
+  return MUL;
+}
+
+/// genMaddR - Generate madd instruction and combine mul and add using
+/// an extra virtual register
+/// Example - an ADD intermediate needs to be stored in a register:
+///   MUL I=A,B,0
+///   ADD R,I,Imm
+///   ==> ORR  V, ZR, Imm
+///   ==> MADD R,A,B,V
+/// \param Root is the ADD instruction
+/// \param [out] InsInstr is a vector of machine instructions and will
+/// contain the generated madd instruction
+/// \param IdxMulOpd is index of operand in Root that is the result of
+/// the MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode fo the madd instruction
+/// \param VR is a virtual register that holds the value of an ADD operand
+/// (V in the example above).
+static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
+                              const TargetInstrInfo *TII, MachineInstr &Root,
+                              SmallVectorImpl<MachineInstr *> &InsInstrs,
+                              unsigned IdxMulOpd, unsigned MaddOpc,
+                              unsigned VR) {
+  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+  MachineOperand R = Root.getOperand(0);
+  MachineOperand A = MUL->getOperand(1);
+  MachineOperand B = MUL->getOperand(2);
+  MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc))
+                                .addOperand(R)
+                                .addOperand(A)
+                                .addOperand(B)
+                                .addReg(VR);
+  // Insert the MADD
+  InsInstrs.push_back(MIB);
+  return MUL;
+}
+/// genAlternativeCodeSequence - when hasPattern() finds a pattern
+/// this function generates the instructions that could replace the
+/// original code sequence
+void AArch64InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo *TII = MF.getTarget().getSubtargetImpl()->getInstrInfo();
+
+  MachineInstr *MUL;
+  unsigned Opc;
+  switch (Pattern) {
+  default:
+    // signal error.
+    break;
+  case MachineCombinerPattern::MC_MULADDW_OP1:
+  case MachineCombinerPattern::MC_MULADDX_OP1:
+    // MUL I=A,B,0
+    // ADD R,I,C
+    // ==> MADD R,A,B,C
+    // --- Create(MADD);
+    Opc = Pattern == MachineCombinerPattern::MC_MULADDW_OP1 ? AArch64::MADDWrrr
+                                                            : AArch64::MADDXrrr;
+    MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc);
+    break;
+  case MachineCombinerPattern::MC_MULADDW_OP2:
+  case MachineCombinerPattern::MC_MULADDX_OP2:
+    // MUL I=A,B,0
+    // ADD R,C,I
+    // ==> MADD R,A,B,C
+    // --- Create(MADD);
+    Opc = Pattern == MachineCombinerPattern::MC_MULADDW_OP2 ? AArch64::MADDWrrr
+                                                            : AArch64::MADDXrrr;
+    MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc);
+    break;
+  case MachineCombinerPattern::MC_MULADDWI_OP1:
+  case MachineCombinerPattern::MC_MULADDXI_OP1:
+    // MUL I=A,B,0
+    // ADD R,I,Imm
+    // ==> ORR  V, ZR, Imm
+    // ==> MADD R,A,B,V
+    // --- Create(MADD);
+    {
+      const TargetRegisterClass *RC =
+          MRI.getRegClass(Root.getOperand(1).getReg());
+      unsigned NewVR = MRI.createVirtualRegister(RC);
+      unsigned BitSize, OrrOpc, ZeroReg;
+      if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) {
+        BitSize = 32;
+        OrrOpc = AArch64::ORRWri;
+        ZeroReg = AArch64::WZR;
+        Opc = AArch64::MADDWrrr;
+      } else {
+        OrrOpc = AArch64::ORRXri;
+        BitSize = 64;
+        ZeroReg = AArch64::XZR;
+        Opc = AArch64::MADDXrrr;
+      }
+      uint64_t Imm = Root.getOperand(2).getImm();
+
+      if (Root.getOperand(3).isImm()) {
+        unsigned val = Root.getOperand(3).getImm();
+        Imm = Imm << val;
+      }
+      uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+      uint64_t Encoding;
+
+      if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+        MachineInstrBuilder MIB1 =
+            BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc))
+                .addOperand(MachineOperand::CreateReg(NewVR, RegState::Define))
+                .addReg(ZeroReg)
+                .addImm(Encoding);
+        InsInstrs.push_back(MIB1);
+        InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+        MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR);
+      }
+    }
+    break;
+  case MachineCombinerPattern::MC_MULSUBW_OP1:
+  case MachineCombinerPattern::MC_MULSUBX_OP1: {
+    // MUL I=A,B,0
+    // SUB R,I, C
+    // ==> SUB  V, 0, C
+    // ==> MADD R,A,B,V // = -C + A*B
+    // --- Create(MADD);
+    const TargetRegisterClass *RC =
+        MRI.getRegClass(Root.getOperand(1).getReg());
+    unsigned NewVR = MRI.createVirtualRegister(RC);
+    unsigned SubOpc, ZeroReg;
+    if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) {
+      SubOpc = AArch64::SUBWrr;
+      ZeroReg = AArch64::WZR;
+      Opc = AArch64::MADDWrrr;
+    } else {
+      SubOpc = AArch64::SUBXrr;
+      ZeroReg = AArch64::XZR;
+      Opc = AArch64::MADDXrrr;
+    }
+    // SUB NewVR, 0, C
+    MachineInstrBuilder MIB1 =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc))
+            .addOperand(MachineOperand::CreateReg(NewVR, RegState::Define))
+            .addReg(ZeroReg)
+            .addOperand(Root.getOperand(2));
+    InsInstrs.push_back(MIB1);
+    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+    MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR);
+  } break;
+  case MachineCombinerPattern::MC_MULSUBW_OP2:
+  case MachineCombinerPattern::MC_MULSUBX_OP2:
+    // MUL I=A,B,0
+    // SUB R,C,I
+    // ==> MSUB R,A,B,C (computes C - A*B)
+    // --- Create(MSUB);
+    Opc = Pattern == MachineCombinerPattern::MC_MULSUBW_OP2 ? AArch64::MSUBWrrr
+                                                            : AArch64::MSUBXrrr;
+    MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc);
+    break;
+  case MachineCombinerPattern::MC_MULSUBWI_OP1:
+  case MachineCombinerPattern::MC_MULSUBXI_OP1: {
+    // MUL I=A,B,0
+    // SUB R,I, Imm
+    // ==> ORR  V, ZR, -Imm
+    // ==> MADD R,A,B,V // = -Imm + A*B
+    // --- Create(MADD);
+    const TargetRegisterClass *RC =
+        MRI.getRegClass(Root.getOperand(1).getReg());
+    unsigned NewVR = MRI.createVirtualRegister(RC);
+    unsigned BitSize, OrrOpc, ZeroReg;
+    if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) {
+      BitSize = 32;
+      OrrOpc = AArch64::ORRWri;
+      ZeroReg = AArch64::WZR;
+      Opc = AArch64::MADDWrrr;
+    } else {
+      OrrOpc = AArch64::ORRXri;
+      BitSize = 64;
+      ZeroReg = AArch64::XZR;
+      Opc = AArch64::MADDXrrr;
+    }
+    int Imm = Root.getOperand(2).getImm();
+    if (Root.getOperand(3).isImm()) {
+      unsigned val = Root.getOperand(3).getImm();
+      Imm = Imm << val;
+    }
+    uint64_t UImm = -Imm << (64 - BitSize) >> (64 - BitSize);
+    uint64_t Encoding;
+    if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+      MachineInstrBuilder MIB1 =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc))
+              .addOperand(MachineOperand::CreateReg(NewVR, RegState::Define))
+              .addReg(ZeroReg)
+              .addImm(Encoding);
+      InsInstrs.push_back(MIB1);
+      InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+      MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR);
+    }
+  } break;
+  }
+  // Record MUL and ADD/SUB for deletion
+  DelInstrs.push_back(MUL);
+  DelInstrs.push_back(&Root);
+
+  return;
+}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index d465fb757a0..87e78490c9d 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -17,6 +17,7 @@
 #include "AArch64.h"
 #include "AArch64RegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
 
 #define GET_INSTRINFO_HEADER
 #include "AArch64GenInstrInfo.inc"
@@ -156,9 +157,26 @@ public:
   bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
                             unsigned SrcReg2, int CmpMask, int CmpValue,
                             const MachineRegisterInfo *MRI) const override;
+  /// hasPattern - return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in <Root>. All potential patterns are
+  /// listed
+  /// in the <Pattern> array.
+  virtual bool hasPattern(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const;
+
+  /// genAlternativeCodeSequence - when hasPattern() finds a pattern
+  /// this function generates the instructions that could replace the
+  /// original code sequence
+  virtual void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
+  /// useMachineCombiner - AArch64 supports MachineCombiner
+  virtual bool useMachineCombiner(void) const;
 
   bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
-
 private:
   void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
                              MachineBasicBlock *TBB,
diff --git a/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/lib/Target/AArch64/AArch64MachineCombinerPattern.h
new file mode 100644
index 00000000000..568ce897360
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MachineCombinerPattern.h
@@ -0,0 +1,42 @@
+//===- AArch64MachineCombinerPattern.h                                    -===//
+//===- AArch64 instruction pattern supported by combiner                  -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines instruction pattern supported by combiner
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AArch64MACHINECOMBINERPATTERN_H
+#define LLVM_TARGET_AArch64MACHINECOMBINERPATTERN_H
+
+namespace llvm {
+
+/// Enumeration of instruction pattern supported by machine combiner
+///
+///
+namespace MachineCombinerPattern {
+enum MC_PATTERN : int {
+  MC_NONE = 0,
+  MC_MULADDW_OP1 = 1,
+  MC_MULADDW_OP2 = 2,
+  MC_MULSUBW_OP1 = 3,
+  MC_MULSUBW_OP2 = 4,
+  MC_MULADDWI_OP1 = 5,
+  MC_MULSUBWI_OP1 = 6,
+  MC_MULADDX_OP1 = 7,
+  MC_MULADDX_OP2 = 8,
+  MC_MULSUBX_OP1 = 9,
+  MC_MULSUBX_OP2 = 10,
+  MC_MULADDXI_OP1 = 11,
+  MC_MULSUBXI_OP1 = 12
+};
+} // end namespace MachineCombinerPattern
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 2e9bb0cd823..9b01159ce32 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -24,6 +24,10 @@ static cl::opt<bool>
 EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"),
            cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnableMCR("aarch64-mcr",
+                               cl::desc("Enable the machine combiner pass"),
+                               cl::init(true), cl::Hidden);
+
 static cl::opt<bool>
 EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"),
                      cl::init(true), cl::Hidden);
@@ -174,6 +178,8 @@ bool AArch64PassConfig::addInstSelector() {
 bool AArch64PassConfig::addILPOpts() {
   if (EnableCCMP)
     addPass(createAArch64ConditionalCompares());
+  if (EnableMCR)
+    addPass(&MachineCombinerID);
   addPass(&EarlyIfConverterID);
   if (EnableStPairSuppress)
     addPass(createAArch64StorePairSuppressPass());
diff --git a/test/CodeGen/AArch64/madd-lohi.ll b/test/CodeGen/AArch64/madd-lohi.ll
new file mode 100644
index 00000000000..550a8cb24b7
--- /dev/null
+++ b/test/CodeGen/AArch64/madd-lohi.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
+
+define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
+; CHECK-LABEL: test_128bitmul:
+; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
+; CHECK-DAG: madd [[PART1:x[0-9]+]], x0, x3, [[CARRY]]
+; CHECK: madd x1, x1, x2, [[PART1]]
+; CHECK: mul x0, x0, x2
+
+; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
+; CHECK-BE-DAG: madd [[PART1:x[0-9]+]], x1, x2, [[CARRY]]
+; CHECK-BE: madd x0, x0, x3, [[PART1]]
+; CHECK-BE: mul x1, x1, x3
+
+  %prod = mul i128 %lhs, %rhs
+  ret i128 %prod
+}
diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll
index 550a8cb24b7..4515697b999 100644
--- a/test/CodeGen/AArch64/mul-lohi.ll
+++ b/test/CodeGen/AArch64/mul-lohi.ll
@@ -1,17 +1,16 @@
-; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
-
+; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - | FileCheck --check-prefix=CHECK-BE %s
 define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: test_128bitmul:
+; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3
 ; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
-; CHECK-DAG: madd [[PART1:x[0-9]+]], x0, x3, [[CARRY]]
-; CHECK: madd x1, x1, x2, [[PART1]]
+; CHECK: mul [[PART2:x[0-9]+]], x1, x2
 ; CHECK: mul x0, x0, x2
 
 ; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE-DAG: mul [[PART1:x[0-9]+]], x1, x2
 ; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
-; CHECK-BE-DAG: madd [[PART1:x[0-9]+]], x1, x2, [[CARRY]]
-; CHECK-BE: madd x0, x0, x3, [[PART1]]
+; CHECK-BE: mul [[PART2:x[0-9]+]], x0, x3
 ; CHECK-BE: mul x1, x1, x3
 
   %prod = mul i128 %lhs, %rhs
-- 
2.34.1