Get rid of a pseudo instruction and replace it with subreg based operation on real...

[oota-llvm.git] / lib / Target / X86 / X86ISelDAGToDAG.cpp
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp

index 144776e68c1f23cff0c280a183cd478cdc0d376d..bf233bfcbb459431c851f933662fdeab6819e8ed 100644 (file)
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2,8 +2,8 @@
  //
  //                     The LLVM Compiler Infrastructure
  //
-// This file was developed by the Evan Cheng and is distributed under
-// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
  //
  //===----------------------------------------------------------------------===//
  //
@@ -16,6 +16,7 @@
  #include "X86.h"
  #include "X86InstrBuilder.h"
  #include "X86ISelLowering.h"
+#include "X86MachineFunctionInfo.h"
  #include "X86RegisterInfo.h"
  #include "X86Subtarget.h"
  #include "X86TargetMachine.h"
@@ -23,23 +24,26 @@
  #include "llvm/Instructions.h"
  #include "llvm/Intrinsics.h"
  #include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
  #include "llvm/CodeGen/MachineConstantPool.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/SelectionDAGISel.h"
  #include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Compiler.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/ADT/Statistic.h"
-#include <deque>
-#include <iostream>
  #include <queue>
  #include <set>
  using namespace llvm;
  
+STATISTIC(NumFPKill   , "Number of FP_REG_KILL instructions added");
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
  //===----------------------------------------------------------------------===//
  //                      Pattern Matcher Implementation
  //===----------------------------------------------------------------------===//
@@ -59,7 +63,7 @@ namespace {
        int FrameIndex;
      } Base;
  
-    bool isRIPRel;     // RIP relative?
+    bool isRIPRel;     // RIP as base?
      unsigned Scale;
      SDOperand IndexReg; 
      unsigned Disp;
@@ -77,12 +81,6 @@ namespace {
  }
  
  namespace {
-  Statistic<>
-  NumFPKill("x86-codegen", "Number of FP_REG_KILL instructions added");
-
-  Statistic<>
-  NumLoadMoved("x86-codegen", "Number of loads moved below TokenFactor");
-
    //===--------------------------------------------------------------------===//
    /// ISel - X86 specific code to select X86 machine instructions for
    /// SelectionDAG operations.
@@ -135,7 +133,7 @@ namespace {
  
      virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF);
  
-    virtual bool CanBeFoldedBy(SDNode *N, SDNode *U);
+    virtual bool CanBeFoldedBy(SDNode *N, SDNode *U, SDNode *Root) const;
  
  // Include the pieces autogenerated from the target description.
  #include "X86GenDAGISel.inc"
@@ -143,15 +141,23 @@ namespace {
    private:
      SDNode *Select(SDOperand N);
  
-    bool MatchAddress(SDOperand N, X86ISelAddressMode &AM, bool isRoot = true);
-    bool SelectAddr(SDOperand N, SDOperand &Base, SDOperand &Scale,
-                    SDOperand &Index, SDOperand &Disp);
-    bool SelectLEAAddr(SDOperand N, SDOperand &Base, SDOperand &Scale,
-                       SDOperand &Index, SDOperand &Disp);
+    bool MatchAddress(SDOperand N, X86ISelAddressMode &AM,
+                      bool isRoot = true, unsigned Depth = 0);
+    bool MatchAddressBase(SDOperand N, X86ISelAddressMode &AM,
+                          bool isRoot, unsigned Depth);
+    bool SelectAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+                    SDOperand &Scale, SDOperand &Index, SDOperand &Disp);
+    bool SelectLEAAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+                       SDOperand &Scale, SDOperand &Index, SDOperand &Disp);
+    bool SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
+                             SDOperand N, SDOperand &Base, SDOperand &Scale,
+                             SDOperand &Index, SDOperand &Disp,
+                             SDOperand &InChain, SDOperand &OutChain);
      bool TryFoldLoad(SDOperand P, SDOperand N,
                       SDOperand &Base, SDOperand &Scale,
                       SDOperand &Index, SDOperand &Disp);
-    void InstructionSelectPreprocess(SelectionDAG &DAG);
+    void PreprocessForRMW(SelectionDAG &DAG);
+    void PreprocessForFPConvert(SelectionDAG &DAG);
  
      /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
      /// inline asm expressions.
@@ -206,45 +212,77 @@ namespace {
      /// base register.  Return the virtual register that holds this value.
      SDNode *getGlobalBaseReg();
  
+    /// getTruncate - return an SDNode that implements a subreg based truncate
+    /// of the specified operand to the specified value type.
+    SDNode *getTruncate(SDOperand N0, MVT::ValueType VT);
+
  #ifndef NDEBUG
      unsigned Indent;
  #endif
    };
  }
  
-static void findNonImmUse(SDNode* Use, SDNode* Def, bool &found,
+/// findFlagUse - Scan the users of N for one that takes N's last result value
+/// as an operand (CanBeFoldedBy only calls this when that value has type
+/// MVT::Flag).  Return the first such user found, or NULL if there is none.
+static SDNode *findFlagUse(SDNode *N) {
+  const unsigned LastResNo = N->getNumValues() - 1;
+  SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+  for (; UI != UE; ++UI) {
+    SDNode *Candidate = *UI;
+    unsigned OpIdx = 0;
+    const unsigned NumOps = Candidate->getNumOperands();
+    while (OpIdx != NumOps) {
+      SDOperand Operand = Candidate->getOperand(OpIdx++);
+      // Only an operand that is exactly N's last result counts as a match.
+      if (Operand.Val != N || Operand.ResNo != LastResNo)
+        continue;
+      return Candidate;
+    }
+  }
+  return NULL;
+}
+
+static void findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
+                          SDNode *Root, SDNode *Skip, bool &found,
                            std::set<SDNode *> &Visited) {
    if (found ||
        Use->getNodeId() > Def->getNodeId() ||
        !Visited.insert(Use).second)
      return;
  
-  for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
+  for (unsigned i = 0, e = Use->getNumOperands(); !found && i != e; ++i) {
      SDNode *N = Use->getOperand(i).Val;
-    if (N != Def) {
-      findNonImmUse(N, Def, found, Visited);
-    } else {
+    if (N == Skip)
+      continue;
+    if (N == Def) {
+      if (Use == ImmedUse)
+        continue; // Immediate use is ok.
+      if (Use == Root) {
+        assert(Use->getOpcode() == ISD::STORE ||
+               Use->getOpcode() == X86ISD::CMP);
+        continue;
+      }
        found = true;
        break;
      }
+    findNonImmUse(N, Def, ImmedUse, Root, Skip, found, Visited);
    }
  }
  
-static inline bool isNonImmUse(SDNode* Use, SDNode* Def) {
+/// isNonImmUse - Start searching from Root up the DAG to check if Def can
+/// be reached. Return true if that's the case. However, ignore direct uses
+/// by ImmedUse (which would be U in the example illustrated in
+/// CanBeFoldedBy) and by Root (which can happen in the store case).
+/// FIXME: to be really generic, we should allow direct use by any node
+/// that is being folded. But realistically, since we only fold loads which
+/// have one non-chain use, we only need to watch out for the load/op/store
+/// and load/op/cmp cases where the root (store / cmp) may reach the load via
+/// its chain operand.
+static inline bool isNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse,
+                               SDNode *Skip = NULL) {
    std::set<SDNode *> Visited;
    bool found = false;
-  for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
-    SDNode *N = Use->getOperand(i).Val;
-    if (N != Def) {
-      findNonImmUse(N, Def, found, Visited);
-      if (found) break;
-    }
-  }
+  findNonImmUse(Root, Def, ImmedUse, Root, Skip, found, Visited);
    return found;
  }
  
  
-bool X86DAGToDAGISel::CanBeFoldedBy(SDNode *N, SDNode *U) {
+bool X86DAGToDAGISel::CanBeFoldedBy(SDNode *N, SDNode *U, SDNode *Root) const {
+  if (FastISel) return false;
+
    // If U use can somehow reach N through another path then U can't fold N or
    // it will create a cycle. e.g. In the following diagram, U can reach N
  // through X. If N is folded into U, then X is both a predecessor and
@@ -257,7 +295,43 @@ bool X86DAGToDAGISel::CanBeFoldedBy(SDNode *N, SDNode *U) {
    //      /        [X]
    //      |         ^
    //     [U]--------|
-  return !FastISel && !isNonImmUse(U, N);
+
+  if (isNonImmUse(Root, N, U))
+    return false;
+
+  // If U produces a flag, then it gets (even more) interesting. Since it
+  // would have been "glued" together with its flag use, we need to check if
+  // it might reach N:
+  //
+  //       [ N ]
+  //        ^ ^
+  //        | |
+  //       [U] \--
+  //        ^   [TF]
+  //        |    ^
+  //        |    |
+  //         \  /
+  //          [FU]
+  //
+  // If FU (the flag use) indirectly reaches N (the load), and U folds N (call
+  // it NU), then TF is a predecessor of FU and a successor of NU. But since
+  // NU and FU are flagged together, this effectively creates a cycle.
+  bool HasFlagUse = false;
+  MVT::ValueType VT = Root->getValueType(Root->getNumValues()-1);
+  while ((VT == MVT::Flag && !Root->use_empty())) {
+    SDNode *FU = findFlagUse(Root);
+    if (FU == NULL)
+      break;
+    else {
+      Root = FU;
+      HasFlagUse = true;
+    }
+    VT = Root->getValueType(Root->getNumValues()-1);
+  }
+
+  if (HasFlagUse)
+    return !isNonImmUse(Root, N, Root, U);
+  return true;
  }
  
  /// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand
@@ -277,9 +351,10 @@ static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load,
                           Store.getOperand(2), Store.getOperand(3));
  }
  
-/// InstructionSelectPreprocess - Preprocess the DAG to allow the instruction
-/// selector to pick more load-modify-store instructions. This is a common
-/// case:
+/// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
+/// This is only run if not in -fast mode (aka -O0).
+/// This allows the instruction selector to pick more read-modify-write
+/// instructions. This is a common case:
  ///
  ///     [Load chain]
  ///         ^
@@ -316,10 +391,10 @@ static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load,
  ///       \      /
  ///        \    /
  ///       [Store]
-void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
+void X86DAGToDAGISel::PreprocessForRMW(SelectionDAG &DAG) {
    for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
           E = DAG.allnodes_end(); I != E; ++I) {
-    if (I->getOpcode() != ISD::STORE)
+    if (!ISD::isNON_TRUNCStore(I))
        continue;
      SDOperand Chain = I->getOperand(0);
      if (Chain.Val->getOpcode() != ISD::TokenFactor)
@@ -345,13 +420,13 @@ void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
        case ISD::ADDE: {
          SDOperand N10 = N1.getOperand(0);
          SDOperand N11 = N1.getOperand(1);
-        if (N10.Val->getOpcode() == ISD::LOAD)
+        if (ISD::isNON_EXTLoad(N10.Val))
            RModW = true;
-        else if (N11.Val->getOpcode() == ISD::LOAD) {
+        else if (ISD::isNON_EXTLoad(N11.Val)) {
            RModW = true;
            std::swap(N10, N11);
          }
-        RModW = RModW && N10.Val->isOperand(Chain.Val) && N10.hasOneUse() &&
+        RModW = RModW && N10.Val->isOperandOf(Chain.Val) && N10.hasOneUse() &&
            (N10.getOperand(1) == N2) &&
            (N10.Val->getValueType(0) == N1.getValueType());
          if (RModW)
@@ -369,8 +444,8 @@ void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
        case X86ISD::SHLD:
        case X86ISD::SHRD: {
          SDOperand N10 = N1.getOperand(0);
-        if (N10.Val->getOpcode() == ISD::LOAD)
-          RModW = N10.Val->isOperand(Chain.Val) && N10.hasOneUse() &&
+        if (ISD::isNON_EXTLoad(N10.Val))
+          RModW = N10.Val->isOperandOf(Chain.Val) && N10.hasOneUse() &&
              (N10.getOperand(1) == N2) &&
              (N10.Val->getValueType(0) == N1.getValueType());
          if (RModW)
@@ -386,6 +461,71 @@ void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
    }
  }
  
+
+/// PreprocessForFPConvert - Walk over the DAG and lower FP_ROUND and FP_EXTEND
+/// nodes that target the FP stack into a store to and a load from a stack
+/// temporary.  This is a gross hack.  We would like to simply mark these as
+/// being illegal, but when we do that, legalize produces these when it expands
+/// calls, then expands these in the same legalize pass.  We would like dag
+/// combine to be able to hack on these between the call expansion and the node
+/// legalization.  As such this pass basically does "really late" legalization
+/// of these inline with the X86 isel pass.
+void X86DAGToDAGISel::PreprocessForFPConvert(SelectionDAG &DAG) {
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ) {
+    SDNode *N = I++;  // Advance the iterator now to avoid invalidation issues.
+    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+      continue;
+    
+    // If both the source and destination are SSE registers, then this is a
+    // legal conversion that should not be lowered.
+    MVT::ValueType SrcVT = N->getOperand(0).getValueType();
+    MVT::ValueType DstVT = N->getValueType(0);
+    bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
+    bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+    if (SrcIsSSE && DstIsSSE)
+      continue;
+
+    if (!SrcIsSSE && !DstIsSSE) {
+      // If this is an FPStack extension, it is a noop.
+      if (N->getOpcode() == ISD::FP_EXTEND)
+        continue;
+      // If this is a value-preserving FPStack truncation, it is a noop.
+      if (N->getConstantOperandVal(1))
+        continue;
+    }
+   
+    // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+    // FPStack has extload and truncstore.  SSE can fold direct loads into other
+    // operations.  Based on this, decide which type the stack slot should use.
+    MVT::ValueType MemVT;
+    if (N->getOpcode() == ISD::FP_ROUND)
+      MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+    else
+      MemVT = SrcIsSSE ? SrcVT : DstVT;
+    
+    SDOperand MemTmp = DAG.CreateStackTemporary(MemVT);
+    
+    // FIXME: optimize the case where the src/dest is a load or store?
+    SDOperand Store = DAG.getTruncStore(DAG.getEntryNode(), N->getOperand(0),
+                                        MemTmp, NULL, 0, MemVT);
+    SDOperand Result = DAG.getExtLoad(ISD::EXTLOAD, DstVT, Store, MemTmp,
+                                      NULL, 0, MemVT);
+
+    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+    // extload we created.  This will cause general havoc on the dag because
+    // anything below the conversion could be folded into other existing nodes.
+    // To avoid invalidating 'I', back it up to the convert node.
+    --I;
+    DAG.ReplaceAllUsesOfValueWith(SDOperand(N, 0), Result);
+    
+    // Now that we did that, the node is dead.  Increment the iterator to the
+    // next node to process, then delete N.
+    ++I;
+    DAG.DeleteNode(N);
+  }  
+}
+
  /// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
  /// when it has created a SelectionDAG for us to codegen.
  void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
@@ -393,52 +533,74 @@ void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
    MachineFunction::iterator FirstMBB = BB;
  
    if (!FastISel)
-    InstructionSelectPreprocess(DAG);
+    PreprocessForRMW(DAG);
+
+  // FIXME: This should only happen when not -fast.
+  PreprocessForFPConvert(DAG);
  
    // Codegen the basic block.
  #ifndef NDEBUG
-  DEBUG(std::cerr << "===== Instruction selection begins:\n");
+  DOUT << "===== Instruction selection begins:\n";
    Indent = 0;
  #endif
    DAG.setRoot(SelectRoot(DAG.getRoot()));
  #ifndef NDEBUG
-  DEBUG(std::cerr << "===== Instruction selection ends:\n");
+  DOUT << "===== Instruction selection ends:\n";
  #endif
  
    DAG.RemoveDeadNodes();
  
-  // Emit machine code to BB. 
+  // Emit machine code to BB.  This can change 'BB' to the last block being 
+  // inserted into.
    ScheduleAndEmitDAG(DAG);
    
    // If we are emitting FP stack code, scan the basic block to determine if this
    // block defines any FP values.  If so, put an FP_REG_KILL instruction before
    // the terminator of the block.
-  if (!Subtarget->hasSSE2()) {
-    // Note that FP stack instructions *are* used in SSE code when returning
-    // values, but these are not live out of the basic block, so we don't need
-    // an FP_REG_KILL in this case either.
-    bool ContainsFPCode = false;
+
+  // Note that FP stack instructions are used in all modes for long double,
+  // so we always need to do this check.
+  // Also note that it's possible for an FP stack register to be live across
+  // an instruction that produces multiple basic blocks (SSE CMOV) so we
+  // must check all the generated basic blocks.
+
+  // Scan all of the machine instructions in these MBBs, checking for FP
+  // stores.  (RFP32 and RFP64 will not exist in SSE mode, but RFP80 might.)
+  MachineFunction::iterator MBBI = FirstMBB;
+  MachineFunction::iterator EndMBB = BB; ++EndMBB;
+  for (; MBBI != EndMBB; ++MBBI) {
+    MachineBasicBlock *MBB = MBBI;
+    
+    // If this block returns, ignore it.  We don't want to insert an FP_REG_KILL
+    // before the return.
+    if (!MBB->empty()) {
+      MachineBasicBlock::iterator EndI = MBB->end();
+      --EndI;
+      if (EndI->getDesc().isReturn())
+        continue;
+    }
      
-    // Scan all of the machine instructions in these MBBs, checking for FP
-    // stores.
-    MachineFunction::iterator MBBI = FirstMBB;
-    do {
-      for (MachineBasicBlock::iterator I = MBBI->begin(), E = MBBI->end();
-           !ContainsFPCode && I != E; ++I) {
+    bool ContainsFPCode = false;
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+         !ContainsFPCode && I != E; ++I) {
+      if (I->getNumOperands() != 0 && I->getOperand(0).isRegister()) {
+        const TargetRegisterClass *clas;
          for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
            if (I->getOperand(op).isRegister() && I->getOperand(op).isDef() &&
-              MRegisterInfo::isVirtualRegister(I->getOperand(op).getReg()) &&
-              RegMap->getRegClass(I->getOperand(0).getReg()) == 
-                X86::RFPRegisterClass) {
+            TargetRegisterInfo::isVirtualRegister(I->getOperand(op).getReg()) &&
+              ((clas = RegInfo->getRegClass(I->getOperand(0).getReg())) == 
+                 X86::RFP32RegisterClass ||
+               clas == X86::RFP64RegisterClass ||
+               clas == X86::RFP80RegisterClass)) {
              ContainsFPCode = true;
              break;
            }
          }
        }
-    } while (!ContainsFPCode && &*(MBBI++) != BB);
-    
+    }
      // Check PHI nodes in successor blocks.  These PHI's will be lowered to have
-    // a copy of the input value in this block.
+    // a copy of the input value in this block.  In SSE mode, we only care about
+    // 80-bit values.
      if (!ContainsFPCode) {
        // Final check, check LLVM BB's that are successors to the LLVM BB
        // corresponding to BB for FP PHI nodes.
@@ -448,17 +610,19 @@ void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
             !ContainsFPCode && SI != E; ++SI) {
          for (BasicBlock::const_iterator II = SI->begin();
               (PN = dyn_cast<PHINode>(II)); ++II) {
-          if (PN->getType()->isFloatingPoint()) {
+          if (PN->getType()==Type::X86_FP80Ty ||
+              (!Subtarget->hasSSE1() && PN->getType()->isFloatingPoint()) ||
+              (!Subtarget->hasSSE2() && PN->getType()==Type::DoubleTy)) {
              ContainsFPCode = true;
              break;
            }
          }
        }
      }
-
      // Finally, if we found any FP code, emit the FP_REG_KILL instruction.
      if (ContainsFPCode) {
-      BuildMI(*BB, BB->getFirstTerminator(), X86::FP_REG_KILL, 0);
+      BuildMI(*MBB, MBBI->getFirstTerminator(),
+              TM.getInstrInfo()->get(X86::FP_REG_KILL));
        ++NumFPKill;
      }
    }
@@ -468,19 +632,9 @@ void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
  /// the main function.
  void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
                                               MachineFrameInfo *MFI) {
-  if (Subtarget->TargetType == X86Subtarget::isCygwin)
-    BuildMI(BB, X86::CALLpcrel32, 1).addExternalSymbol("__main");
-
-  // Switch the FPU to 64-bit precision mode for better compatibility and speed.
-  int CWFrameIdx = MFI->CreateStackObject(2, 2);
-  addFrameReference(BuildMI(BB, X86::FNSTCW16m, 4), CWFrameIdx);
-
-  // Set the high part to be 64-bit precision.
-  addFrameReference(BuildMI(BB, X86::MOV8mi, 5),
-                    CWFrameIdx, 1).addImm(2);
-
-  // Reload the modified control word now.
-  addFrameReference(BuildMI(BB, X86::FLDCW16m, 4), CWFrameIdx);
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  if (Subtarget->isTargetCygMing())
+    BuildMI(BB, TII->get(X86::CALLpcrel32)).addExternalSymbol("__main");
  }
  
  void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
@@ -492,9 +646,13 @@ void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
  
  /// MatchAddress - Add the specified node to the specified addressing mode,
  /// returning true if it cannot be done.  This just pattern matches for the
-/// addressing mode
+/// addressing mode.
  bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
-                                   bool isRoot) {
+                                   bool isRoot, unsigned Depth) {
+  // Limit recursion.
+  if (Depth > 5)
+    return MatchAddressBase(N, AM, isRoot, Depth);
+  
    // RIP relative addressing: %rip + 32-bit displacement!
    if (AM.isRIPRel) {
      if (!AM.ES && AM.JT != -1 && N.getOpcode() == ISD::Constant) {
@@ -508,7 +666,7 @@ bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
    }
  
    int id = N.Val->getNodeId();
-  bool Available = isSelected(id);
+  bool AlreadySelected = isSelected(id); // Already selected, not yet replaced.
  
    switch (N.getOpcode()) {
    default: break;
@@ -521,56 +679,48 @@ bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
      break;
    }
  
-  case X86ISD::Wrapper:
+  case X86ISD::Wrapper: {
+    bool is64Bit = Subtarget->is64Bit();
+    // Under X86-64 non-small code model, GV (and friends) are 64-bits.
+    // Also, base and index reg must be 0 in order to use rip as base.
+    if (is64Bit && (TM.getCodeModel() != CodeModel::Small ||
+                    AM.Base.Reg.Val || AM.IndexReg.Val))
+      break;
+    if (AM.GV != 0 || AM.CP != 0 || AM.ES != 0 || AM.JT != -1)
+      break;
      // If value is available in a register both base and index components have
      // been picked, we can't fit the result available in the register in the
      // addressing mode. Duplicate GlobalAddress or ConstantPool as displacement.
-
-    // Can't fit GV or CP in addressing mode for X86-64 medium or large code
-    // model since the displacement field is 32-bit. Ok for small code model.
-
-    // For X86-64 PIC code, only allow GV / CP + displacement so we can use RIP
-    // relative addressing mode.
-    if ((!Subtarget->is64Bit() || TM.getCodeModel() == CodeModel::Small) &&
-        (!Available || (AM.Base.Reg.Val && AM.IndexReg.Val))) {
-      bool isRIP = Subtarget->is64Bit();
-      if (isRIP && (AM.Base.Reg.Val || AM.Scale > 1 || AM.IndexReg.Val ||
-                    AM.BaseType == X86ISelAddressMode::FrameIndexBase))
-        break;
-      if (ConstantPoolSDNode *CP =
-          dyn_cast<ConstantPoolSDNode>(N.getOperand(0))) {
-        if (AM.CP == 0) {
-          AM.CP = CP->getConstVal();
-          AM.Align = CP->getAlignment();
-          AM.Disp += CP->getOffset();
-          if (isRIP)
-            AM.isRIPRel = true;
-          return false;
-        }
-      } else if (GlobalAddressSDNode *G =
-                 dyn_cast<GlobalAddressSDNode>(N.getOperand(0))) {
-        if (AM.GV == 0) {
-          AM.GV = G->getGlobal();
-          AM.Disp += G->getOffset();
-          if (isRIP)
-            AM.isRIPRel = true;
-          return false;
-        }
-      } else if (isRoot && isRIP) {
-        if (ExternalSymbolSDNode *S =
-            dyn_cast<ExternalSymbolSDNode>(N.getOperand(0))) {
-          AM.ES = S->getSymbol();
-          AM.isRIPRel = true;
-          return false;
-        } else if (JumpTableSDNode *J =
-                   dyn_cast<JumpTableSDNode>(N.getOperand(0))) {
-          AM.JT = J->getIndex();
-          AM.isRIPRel = true;
-          return false;
-        }
+    if (!AlreadySelected || (AM.Base.Reg.Val && AM.IndexReg.Val)) {
+      SDOperand N0 = N.getOperand(0);
+      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+        GlobalValue *GV = G->getGlobal();
+        AM.GV = GV;
+        AM.Disp += G->getOffset();
+        AM.isRIPRel = TM.getRelocationModel() != Reloc::Static &&
+          Subtarget->isPICStyleRIPRel();
+        return false;
+      } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+        AM.CP = CP->getConstVal();
+        AM.Align = CP->getAlignment();
+        AM.Disp += CP->getOffset();
+        AM.isRIPRel = TM.getRelocationModel() != Reloc::Static &&
+          Subtarget->isPICStyleRIPRel();
+        return false;
+      } else if (ExternalSymbolSDNode *S =dyn_cast<ExternalSymbolSDNode>(N0)) {
+        AM.ES = S->getSymbol();
+        AM.isRIPRel = TM.getRelocationModel() != Reloc::Static &&
+          Subtarget->isPICStyleRIPRel();
+        return false;
+      } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+        AM.JT = J->getIndex();
+        AM.isRIPRel = TM.getRelocationModel() != Reloc::Static &&
+          Subtarget->isPICStyleRIPRel();
+        return false;
        }
      }
      break;
+  }
  
    case ISD::FrameIndex:
      if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base.Reg.Val == 0) {
@@ -581,40 +731,48 @@ bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
      break;
  
    case ISD::SHL:
-    if (!Available && AM.IndexReg.Val == 0 && AM.Scale == 1)
-      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1))) {
-        unsigned Val = CN->getValue();
-        if (Val == 1 || Val == 2 || Val == 3) {
-          AM.Scale = 1 << Val;
-          SDOperand ShVal = N.Val->getOperand(0);
-
-          // Okay, we know that we have a scale by now.  However, if the scaled
-          // value is an add of something and a constant, we can fold the
-          // constant into the disp field here.
-          if (ShVal.Val->getOpcode() == ISD::ADD && ShVal.hasOneUse() &&
-              isa<ConstantSDNode>(ShVal.Val->getOperand(1))) {
-            AM.IndexReg = ShVal.Val->getOperand(0);
-            ConstantSDNode *AddVal =
-              cast<ConstantSDNode>(ShVal.Val->getOperand(1));
-            uint64_t Disp = AM.Disp + AddVal->getValue() << Val;
-            if (isInt32(Disp))
-              AM.Disp = Disp;
-            else
-              AM.IndexReg = ShVal;
-          } else {
+    if (AlreadySelected || AM.IndexReg.Val != 0 || AM.Scale != 1 || AM.isRIPRel)
+      break;
+      
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1))) {
+      unsigned Val = CN->getValue();
+      if (Val == 1 || Val == 2 || Val == 3) {
+        AM.Scale = 1 << Val;
+        SDOperand ShVal = N.Val->getOperand(0);
+
+        // Okay, we know that we have a scale by now.  However, if the scaled
+        // value is an add of something and a constant, we can fold the
+        // constant into the disp field here.
+        if (ShVal.Val->getOpcode() == ISD::ADD && ShVal.hasOneUse() &&
+            isa<ConstantSDNode>(ShVal.Val->getOperand(1))) {
+          AM.IndexReg = ShVal.Val->getOperand(0);
+          ConstantSDNode *AddVal =
+            cast<ConstantSDNode>(ShVal.Val->getOperand(1));
+          uint64_t Disp = AM.Disp + (AddVal->getValue() << Val);
+          if (isInt32(Disp))
+            AM.Disp = Disp;
+          else
              AM.IndexReg = ShVal;
-          }
-          return false;
+        } else {
+          AM.IndexReg = ShVal;
          }
+        return false;
        }
      break;
+    }
  
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI:
+    // A mul_lohi where we need the low part can be folded as a plain multiply.
+    if (N.ResNo != 0) break;
+    // FALL THROUGH
    case ISD::MUL:
      // X*[3,5,9] -> X+X*[2,4,8]
-    if (!Available &&
+    if (!AlreadySelected &&
          AM.BaseType == X86ISelAddressMode::RegBase &&
          AM.Base.Reg.Val == 0 &&
-        AM.IndexReg.Val == 0)
+        AM.IndexReg.Val == 0 &&
+        !AM.isRIPRel) {
        if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1)))
          if (CN->getValue() == 3 || CN->getValue() == 5 || CN->getValue() == 9) {
            AM.Scale = unsigned(CN->getValue())-1;
@@ -642,52 +800,97 @@ bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
            AM.IndexReg = AM.Base.Reg = Reg;
            return false;
          }
+    }
      break;
  
-  case ISD::ADD: {
-    if (!Available) {
+  case ISD::ADD:
+    if (!AlreadySelected) {
        X86ISelAddressMode Backup = AM;
-      if (!MatchAddress(N.Val->getOperand(0), AM, false) &&
-          !MatchAddress(N.Val->getOperand(1), AM, false))
+      if (!MatchAddress(N.Val->getOperand(0), AM, false, Depth+1) &&
+          !MatchAddress(N.Val->getOperand(1), AM, false, Depth+1))
          return false;
        AM = Backup;
-      if (!MatchAddress(N.Val->getOperand(1), AM, false) &&
-          !MatchAddress(N.Val->getOperand(0), AM, false))
+      if (!MatchAddress(N.Val->getOperand(1), AM, false, Depth+1) &&
+          !MatchAddress(N.Val->getOperand(0), AM, false, Depth+1))
          return false;
        AM = Backup;
      }
      break;
-  }
  
-  case ISD::OR: {
-    if (!Available) {
+  case ISD::OR:
+    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+    if (AlreadySelected) break;
+      
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
        X86ISelAddressMode Backup = AM;
-      // Look for (x << c1) | c2 where (c2 < c1)
-      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(0));
-      if (CN && !MatchAddress(N.Val->getOperand(1), AM, false)) {
-        if (AM.GV == NULL && AM.Disp == 0 && CN->getValue() < AM.Scale) {
-          AM.Disp = CN->getValue();
-          return false;
-        }
-      }
-      AM = Backup;
-      CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1));
-      if (CN && !MatchAddress(N.Val->getOperand(0), AM, false)) {
-        if (AM.GV == NULL && AM.Disp == 0 && CN->getValue() < AM.Scale) {
-          AM.Disp = CN->getValue();
-          return false;
-        }
+      // Start with the LHS as an addr mode.
+      if (!MatchAddress(N.getOperand(0), AM, false) &&
+          // Address could not have picked a GV address for the displacement.
+          AM.GV == NULL &&
+          // On x86-64, the resultant disp must fit in 32-bits.
+          isInt32(AM.Disp + CN->getSignExtended()) &&
+          // Check to see if the LHS & C is zero.
+          CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
+        AM.Disp += CN->getValue();
+        return false;
        }
        AM = Backup;
      }
      break;
+      
+  case ISD::AND: {
+    // Handle "(x << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this
+    // allows us to fold the shift into this addressing mode.
+    if (AlreadySelected) break;
+    SDOperand Shift = N.getOperand(0);
+    if (Shift.getOpcode() != ISD::SHL) break;
+    
+    // Scale must not be used already.
+    if (AM.IndexReg.Val != 0 || AM.Scale != 1) break;
+
+    // Not when RIP is used as the base.
+    if (AM.isRIPRel) break;
+      
+    ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+    if (!C1 || !C2) break;
+
+    // Not likely to be profitable if either the AND or SHIFT node has more
+    // than one use (unless all uses are for address computation). Besides,
+    // isel mechanism requires their node ids to be reused.
+    if (!N.hasOneUse() || !Shift.hasOneUse())
+      break;
+    
+    // Verify that the shift amount is something we can fold.
+    unsigned ShiftCst = C1->getValue();
+    if (ShiftCst != 1 && ShiftCst != 2 && ShiftCst != 3)
+      break;
+    
+    // Get the new AND mask, this folds to a constant.
+    SDOperand NewANDMask = CurDAG->getNode(ISD::SRL, N.getValueType(),
+                                           SDOperand(C2, 0), SDOperand(C1, 0));
+    SDOperand NewAND = CurDAG->getNode(ISD::AND, N.getValueType(),
+                                       Shift.getOperand(0), NewANDMask);
+    NewANDMask.Val->setNodeId(Shift.Val->getNodeId());
+    NewAND.Val->setNodeId(N.Val->getNodeId());
+    
+    AM.Scale = 1 << ShiftCst;
+    AM.IndexReg = NewAND;
+    return false;
    }
    }
  
+  return MatchAddressBase(N, AM, isRoot, Depth);
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::MatchAddressBase(SDOperand N, X86ISelAddressMode &AM,
+                                       bool isRoot, unsigned Depth) {
    // Is the base register already occupied?
    if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base.Reg.Val) {
      // If so, check to see if the scale index register is set.
-    if (AM.IndexReg.Val == 0) {
+    if (AM.IndexReg.Val == 0 && !AM.isRIPRel) {
        AM.IndexReg = N;
        AM.Scale = 1;
        return false;
@@ -706,8 +909,9 @@ bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
  /// SelectAddr - returns true if it is able pattern match an addressing mode.
  /// It returns the operands which make up the maximal addressing mode it can
  /// match by reference.
-bool X86DAGToDAGISel::SelectAddr(SDOperand N, SDOperand &Base, SDOperand &Scale,
-                                 SDOperand &Index, SDOperand &Disp) {
+bool X86DAGToDAGISel::SelectAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+                                 SDOperand &Scale, SDOperand &Index,
+                                 SDOperand &Disp) {
    X86ISelAddressMode AM;
    if (MatchAddress(N, AM))
      return false;
@@ -725,10 +929,81 @@ bool X86DAGToDAGISel::SelectAddr(SDOperand N, SDOperand &Base, SDOperand &Scale,
    return true;
  }
  
+/// isZeroNode - Return true when Elt is an integer constant equal to zero or
+/// a floating point constant equal to +0.0.
+static inline bool isZeroNode(SDOperand Elt) {
+  // Integer constant: must be exactly 0.
+  if (isa<ConstantSDNode>(Elt) &&
+      cast<ConstantSDNode>(Elt)->getValue() == 0)
+    return true;
+  // Floating point constant: only positive zero qualifies.
+  if (isa<ConstantFPSDNode>(Elt) &&
+      cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())
+    return true;
+  return false;
+}
+
+
+/// SelectScalarSSELoad - Match a scalar SSE load.  In particular, we want to
+/// match a load whose top elements are either undef or zeros.  The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
+///
+/// Op and Pred are forwarded to CanBeFoldedBy (and Op to SelectAddr).  On
+/// success the matched addressing mode is returned through Base, Scale, Index
+/// and Disp, InChain is set to the chain result of the matched load and
+/// OutChain to the chain that load consumes.  Returns false if nothing was
+/// matched; note that InChain may already have been overwritten in that case.
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
+                                          SDOperand N, SDOperand &Base,
+                                          SDOperand &Scale, SDOperand &Index,
+                                          SDOperand &Disp, SDOperand &InChain,
+                                          SDOperand &OutChain) {
+  // Case 1: (scalar_to_vector (load addr)) - the top elements are undef.
+  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    InChain = N.getOperand(0).getValue(1);
+    if (ISD::isNON_EXTLoad(InChain.Val) &&
+        InChain.getValue(0).hasOneUse() &&
+        N.hasOneUse() &&
+        CanBeFoldedBy(N.Val, Pred.Val, Op.Val)) {
+      LoadSDNode *LD = cast<LoadSDNode>(InChain);
+      if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
+        return false;
+      OutChain = LD->getChain();
+      return true;
+    }
+  }
+
+  // Also handle the case where we explicitly require zeros in the top
+  // elements.  This is a vector shuffle from the zero vector.
+  if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
+      // Check to see if the top elements are all zeros (or bitcast of zeros).
+      ISD::isBuildVectorAllZeros(N.getOperand(0).Val) &&
+      N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR && 
+      N.getOperand(1).Val->hasOneUse() &&
+      ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
+      N.getOperand(1).getOperand(0).hasOneUse()) {
+    // Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
+    // from the LHS.
+    unsigned VecWidth=MVT::getVectorNumElements(N.getOperand(0).getValueType());
+    SDOperand ShufMask = N.getOperand(2);
+    assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
+
+    // The first mask element must be the constant VecWidth (the leading '4'
+    // or '2' of the pattern above).  An undef or any other constant index is
+    // not the zero-extending pattern, so refuse to fold rather than folding
+    // the load unconditionally.
+    ConstantSDNode *First = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0));
+    if (!First || First->getValue() != VecWidth)
+      return false;
+
+    // Every remaining mask element must be undef or an index below VecWidth
+    // (an 'L' entry).
+    for (unsigned i = 1; i != VecWidth; ++i) {
+      if (ShufMask.getOperand(i).getOpcode() == ISD::UNDEF)
+        continue;
+      ConstantSDNode *Idx = cast<ConstantSDNode>(ShufMask.getOperand(i));
+      if (Idx->getValue() >= VecWidth) return false;
+    }
+
+    // Okay, this is a zero extending load.  Fold it.
+    LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(1).getOperand(0));
+    if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
+      return false;
+    OutChain = LD->getChain();
+    InChain = SDOperand(LD, 1);
+    return true;
+  }
+  return false;
+}
+
+
  /// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
  /// mode it matches can be cost effectively emitted as an LEA instruction.
-bool X86DAGToDAGISel::SelectLEAAddr(SDOperand N, SDOperand &Base,
-                                    SDOperand &Scale,
+bool X86DAGToDAGISel::SelectLEAAddr(SDOperand Op, SDOperand N,
+                                    SDOperand &Base, SDOperand &Scale,
                                      SDOperand &Index, SDOperand &Disp) {
    X86ISelAddressMode AM;
    if (MatchAddress(N, AM))
@@ -749,10 +1024,9 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDOperand N, SDOperand &Base,
    else
      AM.IndexReg = CurDAG->getRegister(0, VT);
  
-  if (AM.Scale > 2) 
-    Complexity += 2;
-  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg
-  else if (AM.Scale > 1)
+  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
+  // a simple shift.
+  if (AM.Scale > 1)
      Complexity++;
  
    // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
@@ -782,16 +1056,10 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDOperand N, SDOperand &Base,
  bool X86DAGToDAGISel::TryFoldLoad(SDOperand P, SDOperand N,
                                    SDOperand &Base, SDOperand &Scale,
                                    SDOperand &Index, SDOperand &Disp) {
-  if (N.getOpcode() == ISD::LOAD &&
+  if (ISD::isNON_EXTLoad(N.Val) &&
        N.hasOneUse() &&
-      CanBeFoldedBy(N.Val, P.Val))
-    return SelectAddr(N.getOperand(1), Base, Scale, Index, Disp);
-  return false;
-}
-
-static bool isRegister0(SDOperand Op) {
-  if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op))
-    return (R->getReg() == 0);
+      CanBeFoldedBy(N.Val, P.Val, P.Val))
+    return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp);
    return false;
  }
  
@@ -802,14 +1070,28 @@ SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
    assert(!Subtarget->is64Bit() && "X86-64 PIC uses RIP relative addressing");
    if (!GlobalBaseReg) {
      // Insert the set of GlobalBaseReg into the first MBB of the function
-    MachineBasicBlock &FirstMBB = BB->getParent()->front();
+    MachineFunction *MF = BB->getParent();
+    MachineBasicBlock &FirstMBB = MF->front();
      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
-    SSARegMap *RegMap = BB->getParent()->getSSARegMap();
-    // FIXME: when we get to LP64, we will need to create the appropriate
-    // type of register here.
-    GlobalBaseReg = RegMap->createVirtualRegister(X86::GR32RegisterClass);
-    BuildMI(FirstMBB, MBBI, X86::MovePCtoStack, 0);
-    BuildMI(FirstMBB, MBBI, X86::POP32r, 1, GlobalBaseReg);
+    MachineRegisterInfo &RegInfo = MF->getRegInfo();
+    unsigned PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+    
+    const TargetInstrInfo *TII = TM.getInstrInfo();
+    // Operand of MOVPC32r is completely ignored by asm printer. It's
+    // only used in JIT code emission as displacement to pc.
+    BuildMI(FirstMBB, MBBI, TII->get(X86::MOVPC32r), PC).addImm(0);
+    
+    // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+    // not to pc, but to the _GLOBAL_OFFSET_TABLE_ external symbol.
+    if (TM.getRelocationModel() == Reloc::PIC_ &&
+        Subtarget->isPICStyleGOT()) {
+      GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+      BuildMI(FirstMBB, MBBI, TII->get(X86::ADD32ri), GlobalBaseReg)
+        .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
+    } else {
+      GlobalBaseReg = PC;
+    }
+    
    }
    return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).Val;
  }
@@ -821,6 +1103,43 @@ static SDNode *FindCallStartFromCall(SDNode *Node) {
    return FindCallStartFromCall(Node->getOperand(0).Val);
  }
  
+/// getTruncate - Build a node that truncates N0 to the narrower integer type
+/// VT by extracting the corresponding sub-register with EXTRACT_SUBREG.  VT
+/// must be i8, i16 or i32.  When truncating to i8 on a 32-bit target, the
+/// source (i16 or i32) is first copied with MOV16to16_ / MOV32to32_ so that it
+/// has an 8-bit subreg, and the extract is glued to that copy.
+SDNode *X86DAGToDAGISel::getTruncate(SDOperand N0, MVT::ValueType VT) {
+    SDOperand SRIdx;
+    switch (VT) {
+    case MVT::i8:
+      SRIdx = CurDAG->getTargetConstant(1, MVT::i32); // SubRegSet 1
+      // Ensure that the source register has an 8-bit subreg on 32-bit targets
+      if (!Subtarget->is64Bit()) { 
+        unsigned Opc;
+        // Type of the intermediate copy.  Kept under its own name so that the
+        // requested result type VT is not shadowed.
+        MVT::ValueType N0VT = N0.getValueType();
+        switch (N0VT) {
+        default: assert(0 && "Unknown truncate!");
+        case MVT::i16:
+          Opc = X86::MOV16to16_;
+          break;
+        case MVT::i32:
+          Opc = X86::MOV32to32_;
+          break;
+        }
+        N0 = SDOperand(CurDAG->getTargetNode(Opc, N0VT, MVT::Flag, N0), 0);
+        // The extract produces the truncated type VT (i8), not the source
+        // type; operand 3 is the flag result of the copy above.
+        return CurDAG->getTargetNode(X86::EXTRACT_SUBREG,
+                                     VT, N0, SRIdx, N0.getValue(1));
+      }
+      break;
+    case MVT::i16:
+      SRIdx = CurDAG->getTargetConstant(2, MVT::i32); // SubRegSet 2
+      break;
+    case MVT::i32:
+      SRIdx = CurDAG->getTargetConstant(3, MVT::i32); // SubRegSet 3
+      break;
+    default: assert(0 && "Unknown truncate!"); break;
+    }
+    return CurDAG->getTargetNode(X86::EXTRACT_SUBREG, VT, N0, SRIdx);
+}
+
+
  SDNode *X86DAGToDAGISel::Select(SDOperand N) {
    SDNode *Node = N.Val;
    MVT::ValueType NVT = Node->getValueType(0);
@@ -828,19 +1147,17 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
    unsigned Opcode = Node->getOpcode();
  
  #ifndef NDEBUG
-  DEBUG(std::cerr << std::string(Indent, ' '));
-  DEBUG(std::cerr << "Selecting: ");
+  DOUT << std::string(Indent, ' ') << "Selecting: ";
    DEBUG(Node->dump(CurDAG));
-  DEBUG(std::cerr << "\n");
+  DOUT << "\n";
    Indent += 2;
  #endif
  
    if (Opcode >= ISD::BUILTIN_OP_END && Opcode < X86ISD::FIRST_NUMBER) {
  #ifndef NDEBUG
-    DEBUG(std::cerr << std::string(Indent-2, ' '));
-    DEBUG(std::cerr << "== ");
+    DOUT << std::string(Indent-2, ' ') << "== ";
      DEBUG(Node->dump(CurDAG));
-    DEBUG(std::cerr << "\n");
+    DOUT << "\n";
      Indent -= 2;
  #endif
      return NULL;   // Already selected.
@@ -851,11 +1168,63 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
      case X86ISD::GlobalBaseReg: 
        return getGlobalBaseReg();
  
+    // FIXME: This is a workaround for a tblgen problem: rdar://5791600
+    case X86ISD::RET_FLAG:
+      if (ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+        if (Amt->getSignExtended() != 0) break;
+        
+        // Match (X86retflag 0).
+        SDOperand Chain = N.getOperand(0);
+        bool HasInFlag = N.getOperand(N.getNumOperands()-1).getValueType()
+                          == MVT::Flag;
+        SmallVector<SDOperand, 8> Ops0;
+        AddToISelQueue(Chain);
+        SDOperand InFlag(0, 0);
+        if (HasInFlag) {
+          InFlag = N.getOperand(N.getNumOperands()-1);
+          AddToISelQueue(InFlag);
+        }
+        for (unsigned i = 2, e = N.getNumOperands()-(HasInFlag?1:0); i != e;
+             ++i) {
+          AddToISelQueue(N.getOperand(i));
+          Ops0.push_back(N.getOperand(i));
+        }
+        Ops0.push_back(Chain);
+        if (HasInFlag)
+          Ops0.push_back(InFlag);
+        return CurDAG->getTargetNode(X86::RET, MVT::Other,
+                                     &Ops0[0], Ops0.size());
+      }
+      break;
+      
+    case X86ISD::FP_GET_ST0_ST1: {
+      SDOperand Chain = N.getOperand(0);
+      SDOperand InFlag = N.getOperand(1);
+      AddToISelQueue(Chain);
+      AddToISelQueue(InFlag);
+      std::vector<MVT::ValueType> Tys;
+      Tys.push_back(MVT::f80);
+      Tys.push_back(MVT::f80);
+      Tys.push_back(MVT::Other);
+      Tys.push_back(MVT::Flag);
+      SDOperand Ops[] = { Chain, InFlag };
+      SDNode *ResNode = CurDAG->getTargetNode(X86::FpGET_ST0_ST1, Tys,
+                                              Ops, 2);
+      Chain = SDOperand(ResNode, 2);
+      InFlag = SDOperand(ResNode, 3);
+      ReplaceUses(SDOperand(N.Val, 2), Chain);
+      ReplaceUses(SDOperand(N.Val, 3), InFlag);
+      return ResNode;
+    }
+
      case ISD::ADD: {
        // Turn ADD X, c to MOV32ri X+c. This cannot be done with tblgen'd
        // code and is matched first so to prevent it from being turned into
        // LEA32r X+c.
-      // In 64-bit mode, use LEA to take advantage of RIP-relative addressing.
+      // In 64-bit small code size mode, use LEA to take advantage of
+      // RIP-relative addressing.
+      if (TM.getCodeModel() != CodeModel::Small)
+        break;
        MVT::ValueType PtrVT = TLI.getPointerTy();
        SDOperand N0 = N.getOperand(0);
        SDOperand N1 = N.getOperand(1);
@@ -890,9 +1259,13 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
        break;
      }
  
-    case ISD::MULHU:
-    case ISD::MULHS: {
-      if (Opcode == ISD::MULHU)
+    case ISD::SMUL_LOHI:
+    case ISD::UMUL_LOHI: {
+      SDOperand N0 = Node->getOperand(0);
+      SDOperand N1 = Node->getOperand(1);
+
+      bool isSigned = Opcode == ISD::SMUL_LOHI;
+      if (!isSigned)
          switch (NVT) {
          default: assert(0 && "Unsupported VT!");
          case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
@@ -918,71 +1291,90 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
        case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
        }
  
-      SDOperand N0 = Node->getOperand(0);
-      SDOperand N1 = Node->getOperand(1);
-
-      bool foldedLoad = false;
        SDOperand Tmp0, Tmp1, Tmp2, Tmp3;
-      foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
-      // MULHU and MULHS are commmutative
+      bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
+      // Multiply is commutative.
        if (!foldedLoad) {
          foldedLoad = TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3);
-        if (foldedLoad) {
-          N0 = Node->getOperand(1);
-          N1 = Node->getOperand(0);
-        }
+        if (foldedLoad)
+          std::swap(N0, N1);
        }
  
-      SDOperand Chain;
-      if (foldedLoad) {
-        Chain = N1.getOperand(0);
-        AddToISelQueue(Chain);
-      } else
-        Chain = CurDAG->getEntryNode();
-
-      SDOperand InFlag(0, 0);
        AddToISelQueue(N0);
-      Chain  = CurDAG->getCopyToReg(Chain, CurDAG->getRegister(LoReg, NVT),
-                                    N0, InFlag);
-      InFlag = Chain.getValue(1);
+      SDOperand InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), LoReg,
+                                              N0, SDOperand()).getValue(1);
  
        if (foldedLoad) {
+        AddToISelQueue(N1.getOperand(0));
          AddToISelQueue(Tmp0);
          AddToISelQueue(Tmp1);
          AddToISelQueue(Tmp2);
          AddToISelQueue(Tmp3);
-        SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Chain, InFlag };
+        SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N1.getOperand(0), InFlag };
          SDNode *CNode =
            CurDAG->getTargetNode(MOpc, MVT::Other, MVT::Flag, Ops, 6);
-        Chain  = SDOperand(CNode, 0);
          InFlag = SDOperand(CNode, 1);
+        // Update the chain.
+        ReplaceUses(N1.getValue(1), SDOperand(CNode, 0));
        } else {
          AddToISelQueue(N1);
          InFlag =
            SDOperand(CurDAG->getTargetNode(Opc, MVT::Flag, N1, InFlag), 0);
        }
  
-      SDOperand Result = CurDAG->getCopyFromReg(Chain, HiReg, NVT, InFlag);
-      ReplaceUses(N.getValue(0), Result);
-      if (foldedLoad)
-        ReplaceUses(N1.getValue(1), Result.getValue(1));
+      // Copy the low half of the result, if it is needed.
+      if (!N.getValue(0).use_empty()) {
+        SDOperand Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                                  LoReg, NVT, InFlag);
+        InFlag = Result.getValue(2);
+        ReplaceUses(N.getValue(0), Result);
+#ifndef NDEBUG
+        DOUT << std::string(Indent-2, ' ') << "=> ";
+        DEBUG(Result.Val->dump(CurDAG));
+        DOUT << "\n";
+#endif
+      }
+      // Copy the high half of the result, if it is needed.
+      if (!N.getValue(1).use_empty()) {
+        SDOperand Result;
+        if (HiReg == X86::AH && Subtarget->is64Bit()) {
+          // Prevent use of AH in a REX instruction by referencing AX instead.
+          // Shift it down 8 bits.
+          Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                          X86::AX, MVT::i16, InFlag);
+          InFlag = Result.getValue(2);
+          Result = SDOperand(CurDAG->getTargetNode(X86::SHR16ri, MVT::i16, Result,
+                                       CurDAG->getTargetConstant(8, MVT::i8)), 0);
+          // Then truncate it down to i8.
+          SDOperand SRIdx = CurDAG->getTargetConstant(1, MVT::i32); // SubRegSet 1
+          Result = SDOperand(CurDAG->getTargetNode(X86::EXTRACT_SUBREG,
+                                                   MVT::i8, Result, SRIdx), 0);
+        } else {
+          Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                          HiReg, NVT, InFlag);
+          InFlag = Result.getValue(2);
+        }
+        ReplaceUses(N.getValue(1), Result);
+#ifndef NDEBUG
+        DOUT << std::string(Indent-2, ' ') << "=> ";
+        DEBUG(Result.Val->dump(CurDAG));
+        DOUT << "\n";
+#endif
+      }
  
  #ifndef NDEBUG
-      DEBUG(std::cerr << std::string(Indent-2, ' '));
-      DEBUG(std::cerr << "=> ");
-      DEBUG(Result.Val->dump(CurDAG));
-      DEBUG(std::cerr << "\n");
        Indent -= 2;
  #endif
+
        return NULL;
      }
        
-    case ISD::SDIV:
-    case ISD::UDIV:
-    case ISD::SREM:
-    case ISD::UREM: {
-      bool isSigned = Opcode == ISD::SDIV || Opcode == ISD::SREM;
-      bool isDiv    = Opcode == ISD::SDIV || Opcode == ISD::UDIV;
+    case ISD::SDIVREM:
+    case ISD::UDIVREM: {
+      SDOperand N0 = Node->getOperand(0);
+      SDOperand N1 = Node->getOperand(1);
+
+      bool isSigned = Opcode == ISD::SDIVREM;
        if (!isSigned)
          switch (NVT) {
          default: assert(0 && "Unsupported VT!");
@@ -1006,7 +1398,7 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
        default: assert(0 && "Unsupported VT!");
        case MVT::i8:
          LoReg = X86::AL;  HiReg = X86::AH;
-        ClrOpcode  = X86::MOV8r0;
+        ClrOpcode  = 0;
          SExtOpcode = X86::CBW;
          break;
        case MVT::i16:
@@ -1026,103 +1418,209 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
          break;
        }
  
-      SDOperand N0 = Node->getOperand(0);
-      SDOperand N1 = Node->getOperand(1);
-
-      bool foldedLoad = false;
        SDOperand Tmp0, Tmp1, Tmp2, Tmp3;
-      foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
-      SDOperand Chain;
-      if (foldedLoad) {
-        Chain = N1.getOperand(0);
-        AddToISelQueue(Chain);
-      } else
-        Chain = CurDAG->getEntryNode();
-
-      SDOperand InFlag(0, 0);
-      AddToISelQueue(N0);
-      Chain  = CurDAG->getCopyToReg(Chain, CurDAG->getRegister(LoReg, NVT),
-                                    N0, InFlag);
-      InFlag = Chain.getValue(1);
-
-      if (isSigned) {
-        // Sign extend the low part into the high part.
-        InFlag =
-          SDOperand(CurDAG->getTargetNode(SExtOpcode, MVT::Flag, InFlag), 0);
-      } else {
-        // Zero out the high part, effectively zero extending the input.
-        SDOperand ClrNode = SDOperand(CurDAG->getTargetNode(ClrOpcode, NVT), 0);
-        Chain  = CurDAG->getCopyToReg(Chain, CurDAG->getRegister(HiReg, NVT),
-                                      ClrNode, InFlag);
+      bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
+
+      SDOperand InFlag;
+      if (NVT == MVT::i8 && !isSigned) {
+        // Special case for div8, just use a move with zero extension to AX to
+        // clear the upper 8 bits (AH).
+        SDOperand Tmp0, Tmp1, Tmp2, Tmp3, Move, Chain;
+        if (TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3)) {
+          SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N0.getOperand(0) };
+          AddToISelQueue(N0.getOperand(0));
+          AddToISelQueue(Tmp0);
+          AddToISelQueue(Tmp1);
+          AddToISelQueue(Tmp2);
+          AddToISelQueue(Tmp3);
+          Move =
+            SDOperand(CurDAG->getTargetNode(X86::MOVZX16rm8, MVT::i16, MVT::Other,
+                                            Ops, 5), 0);
+          Chain = Move.getValue(1);
+          ReplaceUses(N0.getValue(1), Chain);
+        } else {
+          AddToISelQueue(N0);
+          Move =
+            SDOperand(CurDAG->getTargetNode(X86::MOVZX16rr8, MVT::i16, N0), 0);
+          Chain = CurDAG->getEntryNode();
+        }
+        Chain  = CurDAG->getCopyToReg(Chain, X86::AX, Move, SDOperand());
          InFlag = Chain.getValue(1);
+      } else {
+        AddToISelQueue(N0);
+        InFlag =
+          CurDAG->getCopyToReg(CurDAG->getEntryNode(),
+                               LoReg, N0, SDOperand()).getValue(1);
+        if (isSigned) {
+          // Sign extend the low part into the high part.
+          InFlag =
+            SDOperand(CurDAG->getTargetNode(SExtOpcode, MVT::Flag, InFlag), 0);
+        } else {
+          // Zero out the high part, effectively zero extending the input.
+          SDOperand ClrNode = SDOperand(CurDAG->getTargetNode(ClrOpcode, NVT), 0);
+          InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), HiReg,
+                                        ClrNode, InFlag).getValue(1);
+        }
        }
  
        if (foldedLoad) {
+        AddToISelQueue(N1.getOperand(0));
          AddToISelQueue(Tmp0);
          AddToISelQueue(Tmp1);
          AddToISelQueue(Tmp2);
          AddToISelQueue(Tmp3);
-        SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Chain, InFlag };
+        SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N1.getOperand(0), InFlag };
          SDNode *CNode =
            CurDAG->getTargetNode(MOpc, MVT::Other, MVT::Flag, Ops, 6);
-        Chain  = SDOperand(CNode, 0);
          InFlag = SDOperand(CNode, 1);
+        // Update the chain.
+        ReplaceUses(N1.getValue(1), SDOperand(CNode, 0));
        } else {
          AddToISelQueue(N1);
          InFlag =
            SDOperand(CurDAG->getTargetNode(Opc, MVT::Flag, N1, InFlag), 0);
        }
  
-      SDOperand Result = CurDAG->getCopyFromReg(Chain, isDiv ? LoReg : HiReg,
-                                                NVT, InFlag);
-      ReplaceUses(N.getValue(0), Result);
-      if (foldedLoad)
-        ReplaceUses(N1.getValue(1), Result.getValue(1));
+      // Copy the division (low) result, if it is needed.
+      if (!N.getValue(0).use_empty()) {
+        SDOperand Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                                  LoReg, NVT, InFlag);
+        InFlag = Result.getValue(2);
+        ReplaceUses(N.getValue(0), Result);
+#ifndef NDEBUG
+        DOUT << std::string(Indent-2, ' ') << "=> ";
+        DEBUG(Result.Val->dump(CurDAG));
+        DOUT << "\n";
+#endif
+      }
+      // Copy the remainder (high) result, if it is needed.
+      if (!N.getValue(1).use_empty()) {
+        SDOperand Result;
+        if (HiReg == X86::AH && Subtarget->is64Bit()) {
+          // Prevent use of AH in a REX instruction by referencing AX instead.
+          // Shift it down 8 bits.
+          Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                          X86::AX, MVT::i16, InFlag);
+          InFlag = Result.getValue(2);
+          Result = SDOperand(CurDAG->getTargetNode(X86::SHR16ri, MVT::i16, Result,
+                                       CurDAG->getTargetConstant(8, MVT::i8)), 0);
+          // Then truncate it down to i8.
+          SDOperand SRIdx = CurDAG->getTargetConstant(1, MVT::i32); // SubRegSet 1
+          Result = SDOperand(CurDAG->getTargetNode(X86::EXTRACT_SUBREG,
+                                                   MVT::i8, Result, SRIdx), 0);
+        } else {
+          Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                          HiReg, NVT, InFlag);
+          InFlag = Result.getValue(2);
+        }
+        ReplaceUses(N.getValue(1), Result);
+#ifndef NDEBUG
+        DOUT << std::string(Indent-2, ' ') << "=> ";
+        DEBUG(Result.Val->dump(CurDAG));
+        DOUT << "\n";
+#endif
+      }
  
  #ifndef NDEBUG
-      DEBUG(std::cerr << std::string(Indent-2, ' '));
-      DEBUG(std::cerr << "=> ");
-      DEBUG(Result.Val->dump(CurDAG));
-      DEBUG(std::cerr << "\n");
        Indent -= 2;
  #endif
  
        return NULL;
      }
  
-    case ISD::TRUNCATE: {
-      if (!Subtarget->is64Bit() && NVT == MVT::i8) {
-        unsigned Opc2;
-        MVT::ValueType VT;
-        switch (Node->getOperand(0).getValueType()) {
-        default: assert(0 && "Unknown truncate!");
+    case ISD::ANY_EXTEND: {
+      SDOperand N0 = Node->getOperand(0);
+      AddToISelQueue(N0);
+      if (NVT == MVT::i64 || NVT == MVT::i32 || NVT == MVT::i16) {
+        SDOperand SRIdx;
+        switch(N0.getValueType()) {
+        case MVT::i32:
+          SRIdx = CurDAG->getTargetConstant(X86::SUBREG_32BIT, MVT::i32);
+          break;
          case MVT::i16:
-          Opc = X86::MOV16to16_;
-          VT = MVT::i16;
-          Opc2 = X86::TRUNC_16_to8;
+          SRIdx = CurDAG->getTargetConstant(X86::SUBREG_16BIT, MVT::i32);
            break;
-        case MVT::i32:
-          Opc = X86::MOV32to32_;
-          VT = MVT::i32;
-          Opc2 = X86::TRUNC_32_to8;
+        case MVT::i8:
+          if (Subtarget->is64Bit())
+            SRIdx = CurDAG->getTargetConstant(X86::SUBREG_8BIT, MVT::i32);
            break;
+        default: assert(0 && "Unknown any_extend!");
          }
+        if (SRIdx.Val) {
+          SDOperand ImplVal = 
+              CurDAG->getTargetConstant(X86InstrInfo::IMPL_VAL_UNDEF, MVT::i32);
+          SDNode *ResNode = CurDAG->getTargetNode(X86::INSERT_SUBREG,
+                                                  NVT, ImplVal, N0, SRIdx);
  
-        AddToISelQueue(Node->getOperand(0));
-        SDOperand Tmp =
-          SDOperand(CurDAG->getTargetNode(Opc, VT, Node->getOperand(0)), 0);
-        SDNode *ResNode = CurDAG->getTargetNode(Opc2, NVT, Tmp);
+#ifndef NDEBUG
+          DOUT << std::string(Indent-2, ' ') << "=> ";
+          DEBUG(ResNode->dump(CurDAG));
+          DOUT << "\n";
+          Indent -= 2;
+#endif
+          return ResNode;
+        } // Otherwise let generated ISel handle it.
+      }
+      break;
+    }
+    
+    case ISD::SIGN_EXTEND_INREG: {
+      SDOperand N0 = Node->getOperand(0);
+      AddToISelQueue(N0);
+      
+      MVT::ValueType SVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+      SDOperand TruncOp = SDOperand(getTruncate(N0, SVT), 0);
+      unsigned Opc = 0;
+      switch (NVT) {
+      case MVT::i16:
+        if (SVT == MVT::i8) Opc = X86::MOVSX16rr8;
+        else assert(0 && "Unknown sign_extend_inreg!");
+        break;
+      case MVT::i32:
+        switch (SVT) {
+        case MVT::i8:  Opc = X86::MOVSX32rr8;  break;
+        case MVT::i16: Opc = X86::MOVSX32rr16; break;
+        default: assert(0 && "Unknown sign_extend_inreg!");
+        }
+        break;
+      case MVT::i64:
+        switch (SVT) {
+        case MVT::i8:  Opc = X86::MOVSX64rr8;  break;
+        case MVT::i16: Opc = X86::MOVSX64rr16; break;
+        case MVT::i32: Opc = X86::MOVSX64rr32; break;
+        default: assert(0 && "Unknown sign_extend_inreg!");
+        }
+        break;
+      default: assert(0 && "Unknown sign_extend_inreg!");
+      }
+      
+      SDNode *ResNode = CurDAG->getTargetNode(Opc, NVT, TruncOp);
        
  #ifndef NDEBUG
-        DEBUG(std::cerr << std::string(Indent-2, ' '));
-        DEBUG(std::cerr << "=> ");
+      DOUT << std::string(Indent-2, ' ') << "=> ";
+      DEBUG(TruncOp.Val->dump(CurDAG));
+      DOUT << "\n";
+      DOUT << std::string(Indent-2, ' ') << "=> ";
+      DEBUG(ResNode->dump(CurDAG));
+      DOUT << "\n";
+      Indent -= 2;
+#endif
+      return ResNode;
+      break;
+    }
+    
+    case ISD::TRUNCATE: {
+      SDOperand Input = Node->getOperand(0);
+      AddToISelQueue(Node->getOperand(0));
+      SDNode *ResNode = getTruncate(Input, NVT);
+      
+#ifndef NDEBUG
+        DOUT << std::string(Indent-2, ' ') << "=> ";
          DEBUG(ResNode->dump(CurDAG));
-        DEBUG(std::cerr << "\n");
+        DOUT << "\n";
          Indent -= 2;
  #endif
-        return ResNode;
-      }
-
+      return ResNode;
        break;
      }
    }
@@ -1130,13 +1628,12 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
    SDNode *ResNode = SelectCode(N);
  
  #ifndef NDEBUG
-  DEBUG(std::cerr << std::string(Indent-2, ' '));
-  DEBUG(std::cerr << "=> ");
+  DOUT << std::string(Indent-2, ' ') << "=> ";
    if (ResNode == NULL || ResNode == N.Val)
      DEBUG(N.Val->dump(CurDAG));
    else
      DEBUG(ResNode->dump(CurDAG));
-  DEBUG(std::cerr << "\n");
+  DOUT << "\n";
    Indent -= 2;
  #endif
  
@@ -1152,7 +1649,7 @@ SelectInlineAsmMemoryOperand(const SDOperand &Op, char ConstraintCode,
    case 'v':   // not offsetable    ??
    default: return true;
    case 'm':   // memory
-    if (!SelectAddr(Op, Op0, Op1, Op2, Op3))
+    if (!SelectAddr(Op, Op, Op0, Op1, Op2, Op3))
        return true;
      break;
    }