From 2038252c6a36efd18cc0bef216fa2c5bb9236617 Mon Sep 17 00:00:00 2001
From: Dan Gohman <gohman@apple.com>
Date: Tue, 10 Jul 2007 00:05:58 +0000
Subject: [PATCH] Define non-intrinsic instructions for vector min, max, sqrt,
 rsqrt, and rcp, in addition to the intrinsic forms. Add spill-folding entries
 for these new instructions, and for the scalar min and max intrinsic
 instructions which were missing. And add some preliminary ISelLowering code
 for using the new non-intrinsic vector sqrt instruction, and fneg and fabs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@38478 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  57 +++-
 lib/Target/X86/X86ISelLowering.h   |   6 +
 lib/Target/X86/X86InstrSSE.td      | 510 +++++++++++++++++++----------
 lib/Target/X86/X86RegisterInfo.cpp |  20 ++
 4 files changed, 398 insertions(+), 195 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b8dad13ee0f..3bf2b9f6c5d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -331,6 +331,13 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::VECTOR_SHUFFLE,     (MVT::ValueType)VT, Expand);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FABS, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSIN, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FCOS, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FREM, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FPOWI, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSQRT, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, (MVT::ValueType)VT, Expand);
   }
 
   if (Subtarget->hasMMX()) {
@@ -408,6 +415,9 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
@@ -435,6 +445,9 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 
     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
@@ -3326,16 +3339,21 @@ SDOperand X86TargetLowering::LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) {
 
 SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) {
   MVT::ValueType VT = Op.getValueType();
-  const Type *OpNTy =  MVT::getTypeForValueType(VT);
+  MVT::ValueType EltVT = VT;
+  if (MVT::isVector(VT))
+    EltVT = MVT::getVectorElementType(VT);
+  const Type *OpNTy =  MVT::getTypeForValueType(EltVT);
   std::vector<Constant*> CV;
-  if (VT == MVT::f64) {
-    CV.push_back(ConstantFP::get(OpNTy, BitsToDouble(~(1ULL << 63))));
-    CV.push_back(ConstantFP::get(OpNTy, 0.0));
+  if (EltVT == MVT::f64) {
+    Constant *C = ConstantFP::get(OpNTy, BitsToDouble(~(1ULL << 63)));
+    CV.push_back(C);
+    CV.push_back(C);
   } else {
-    CV.push_back(ConstantFP::get(OpNTy, BitsToFloat(~(1U << 31))));
-    CV.push_back(ConstantFP::get(OpNTy, 0.0));
-    CV.push_back(ConstantFP::get(OpNTy, 0.0));
-    CV.push_back(ConstantFP::get(OpNTy, 0.0));
+    Constant *C = ConstantFP::get(OpNTy, BitsToFloat(~(1U << 31)));
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
   }
   Constant *CS = ConstantStruct::get(CV);
   SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
@@ -3350,16 +3368,21 @@ SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) {
 
 SDOperand X86TargetLowering::LowerFNEG(SDOperand Op, SelectionDAG &DAG) {
   MVT::ValueType VT = Op.getValueType();
-  const Type *OpNTy =  MVT::getTypeForValueType(VT);
+  MVT::ValueType EltVT = VT;
+  if (MVT::isVector(VT))
+    EltVT = MVT::getVectorElementType(VT);
+  const Type *OpNTy =  MVT::getTypeForValueType(EltVT);
   std::vector<Constant*> CV;
-  if (VT == MVT::f64) {
-    CV.push_back(ConstantFP::get(OpNTy, BitsToDouble(1ULL << 63)));
-    CV.push_back(ConstantFP::get(OpNTy, 0.0));
+  if (EltVT == MVT::f64) {
+    Constant *C = ConstantFP::get(OpNTy, BitsToDouble(1ULL << 63));
+    CV.push_back(C);
+    CV.push_back(C);
   } else {
-    CV.push_back(ConstantFP::get(OpNTy, BitsToFloat(1U << 31)));
-    CV.push_back(ConstantFP::get(OpNTy, 0.0));
-    CV.push_back(ConstantFP::get(OpNTy, 0.0));
-    CV.push_back(ConstantFP::get(OpNTy, 0.0));
+    Constant *C = ConstantFP::get(OpNTy, BitsToFloat(1U << 31));
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
   }
   Constant *CS = ConstantStruct::get(CV);
   SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
@@ -4284,6 +4307,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
   case X86ISD::FMAX:               return "X86ISD::FMAX";
   case X86ISD::FMIN:               return "X86ISD::FMIN";
+  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
+  case X86ISD::FRCP:               return "X86ISD::FRCP";
   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
   case X86ISD::THREAD_POINTER:     return "X86ISD::THREAD_POINTER";
   }
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 00d93755bd1..b9aaefa5c87 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -177,6 +177,12 @@ namespace llvm {
       /// FMAX, FMIN - Floating point max and min.
       ///
       FMAX, FMIN,
+
+      /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+      /// approximation.  Note that these typically require refinement
+      /// in order to obtain suitable precision.
+      FRSQRT, FRCP,
+
       // Thread Local Storage
       TLSADDR, THREAD_POINTER
     };
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 2cbd31e5ea4..5fc7a65a084 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -31,6 +31,8 @@ def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
                         [SDNPCommutative, SDNPAssociative]>;
 def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
                         [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
+def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
 def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
 def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest,
                         [SDNPHasChain, SDNPOutFlag]>;
@@ -247,16 +249,6 @@ class PSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
 class PSIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
       : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
 
-// Helpers for defining instructions that directly correspond to intrinsics.
-multiclass SS_IntUnary<bits<8> o, string OpcodeStr, Intrinsic IntId> {
-  def r : SSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src),
-              !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
-              [(set VR128:$dst, (v4f32 (IntId VR128:$src)))]>;
-  def m : SSI<o, MRMSrcMem, (ops VR128:$dst, ssmem:$src),
-              !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
-              [(set VR128:$dst, (v4f32 (IntId sse_load_f32:$src)))]>;
-}
-
 // Move Instructions
 def MOVSSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, FR32:$src),
                   "movss {$src, $dst|$dst, $src}", []>;
@@ -267,18 +259,6 @@ def MOVSSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, FR32:$src),
                   "movss {$src, $dst|$dst, $src}",
                   [(store FR32:$src, addr:$dst)]>;
 
-def SQRTSSr : SSI<0x51, MRMSrcReg, (ops FR32:$dst, FR32:$src),
-                  "sqrtss {$src, $dst|$dst, $src}",
-                  [(set FR32:$dst, (fsqrt FR32:$src))]>;
-def SQRTSSm : SSI<0x51, MRMSrcMem, (ops FR32:$dst, f32mem:$src),
-                  "sqrtss {$src, $dst|$dst, $src}",
-                  [(set FR32:$dst, (fsqrt (loadf32 addr:$src)))]>;
-
-// Aliases to match intrinsics which expect XMM operand(s).
-defm SQRTSS_Int  : SS_IntUnary<0x51, "sqrtss" , int_x86_sse_sqrt_ss>;
-defm RSQRTSS_Int : SS_IntUnary<0x52, "rsqrtss", int_x86_sse_rsqrt_ss>;
-defm RCPSS_Int   : SS_IntUnary<0x53, "rcpss"  , int_x86_sse_rcp_ss>;
-
 // Conversion instructions
 def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, FR32:$src),
                       "cvttss2si {$src, $dst|$dst, $src}",
@@ -425,20 +405,20 @@ def FsANDNPSrm : PSI<0x55, MRMSrcMem,
                      "andnps {$src2, $dst|$dst, $src2}", []>;
 }
 
-/// scalar_sse1_fp_binop_rm - Scalar SSE1 binops come in three basic forms:
-///  
-///  1. f32 - This comes in SSE1 form for floats.
-///  2. rr vs rm - They include a reg+reg form and a reg+mem form.
+/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements undefined.
 ///
-/// In addition, scalar SSE ops have an intrinsic form.  This form is unlike the
-/// normal form, in that they take an entire vector (instead of a scalar) and
-/// leave the top elements undefined.  This adds another two variants of the
-/// above permutations, giving us 8 forms for 'instruction'.
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
 ///
 let isTwoAddress = 1 in {
-multiclass scalar_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
-                                   SDNode OpNode, Intrinsic F32Int,
-                                   bit Commutable = 0> {
+multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, Intrinsic F32Int,
+                                  bit Commutable = 0> {
   // Scalar operation, reg+reg.
   def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
                  !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
@@ -451,14 +431,26 @@ multiclass scalar_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
                  !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
                  [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
                  
-  // Vector intrinsic operation, reg+reg.
+  // Vector operation, reg+reg.
+  def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
   def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
     let isCommutable = Commutable;
   }
 
-  // Vector intrinsic operation, reg+mem.
+  // Intrinsic operation, reg+mem.
   def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2),
                      !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (F32Int VR128:$src1,
@@ -467,13 +459,82 @@ multiclass scalar_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
 }
 
 // Arithmetic instructions
-defm ADD : scalar_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
-defm MUL : scalar_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
-defm SUB : scalar_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
-defm DIV : scalar_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
+defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
+defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
+defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
+defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
+
+/// sse1_fp_binop_rm - Other SSE1 binops
+///
+/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form.  Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode,
+                            Intrinsic F32Int,
+                            Intrinsic V4F32Int,
+                            bit Commutable = 0> {
+
+  // Scalar operation, reg+reg.
+  def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                 !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2),
+                 !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+                 
+  // Vector operation, reg+reg.
+  def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2),
+                     !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1,
+                                               sse_load_f32:$src2))]>;
+
+  // Vector intrinsic operation, reg+reg.
+  def PSrr_Int : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, reg+mem (memory operand is a full v4f32).
+  def PSrm_Int : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                     !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V4F32Int VR128:$src1, (load addr:$src2)))]>;
+}
+}
 
-defm MAX : scalar_sse1_fp_binop_rm<0x5F, "max", X86fmax, int_x86_sse_max_ss>;
-defm MIN : scalar_sse1_fp_binop_rm<0x5D, "min", X86fmin, int_x86_sse_min_ss>;
+defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax,
+                            int_x86_sse_max_ss, int_x86_sse_max_ps>;
+defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
+                            int_x86_sse_min_ss, int_x86_sse_min_ps>;
 
 //===----------------------------------------------------------------------===//
 // SSE packed FP Instructions
@@ -550,70 +611,85 @@ def MOVHLPSrr : PSI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
 
 
 
-/// packed_sse1_fp_binop_rm - Packed SSE binops come in three basic forms:
-///  1. v4f32    - This comes in SSE1 form for float.
-///  2. rr vs rm - They include a reg+reg form and a ref+mem form.
+// Arithmetic
+
+/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms.
 ///
-let isTwoAddress = 1 in {
-multiclass packed_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
-                                   SDNode OpNode, bit Commutable = 0> {
-  // Packed operation, reg+reg.
-  def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
-               !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
-               [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// There is also a separate variant for the full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
+                           SDNode OpNode,
+                           Intrinsic F32Int,
+                           Intrinsic V4F32Int,
+                           bit Commutable = 0> {
+  // Scalar operation, reg.
+  def SSr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+                !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode FR32:$src))]> {
     let isCommutable = Commutable;
   }
 
-  // Packed operation, reg+mem.
-  def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
-                 !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
-                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>;
-}
-}
-
-defm ADD : packed_sse1_fp_binop_rm<0x58, "add", fadd, 1>;
-defm MUL : packed_sse1_fp_binop_rm<0x59, "mul", fmul, 1>;
-defm DIV : packed_sse1_fp_binop_rm<0x5E, "div", fdiv>;
-defm SUB : packed_sse1_fp_binop_rm<0x5C, "sub", fsub>;
-
-// Arithmetic
+  // Scalar operation, mem.
+  def SSm : SSI<opc, MRMSrcMem, (ops FR32:$dst, f32mem:$src),
+                !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode (load addr:$src)))]>;
+                 
+  // Vector operation, reg.
+  def PSr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+              !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> {
+    let isCommutable = Commutable;
+  }
 
-class PS_Intr<bits<8> o, string OpcodeStr, Intrinsic IntId>
-  : PSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src),
-        !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
-        [(set VR128:$dst, (IntId VR128:$src))]>;
-class PS_Intm<bits<8> o, string OpcodeStr, Intrinsic IntId>
-  : PSI<o, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
-        !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
-        [(set VR128:$dst, (IntId (load addr:$src)))]>;
-
-class PS_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
-  : PSI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
-        !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
-        [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
-class PS_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
-  : PSI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f32mem:$src2),
-        !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
-        [(set VR128:$dst, (IntId VR128:$src1, (load addr:$src2)))]>;
+  // Vector operation, mem.
+  def PSm : PSI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>;
 
-def SQRTPSr  : PS_Intr<0x51, "sqrtps", int_x86_sse_sqrt_ps>;
-def SQRTPSm  : PS_Intm<0x51, "sqrtps", int_x86_sse_sqrt_ps>;
+  // Intrinsic operation, reg.
+  def SSr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
 
-def RSQRTPSr : PS_Intr<0x52, "rsqrtps", int_x86_sse_rsqrt_ps>;
-def RSQRTPSm : PS_Intm<0x52, "rsqrtps", int_x86_sse_rsqrt_ps>;
-def RCPPSr   : PS_Intr<0x53, "rcpps", int_x86_sse_rcp_ps>;
-def RCPPSm   : PS_Intm<0x53, "rcpps", int_x86_sse_rcp_ps>;
+  // Intrinsic operation, mem.
+  def SSm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, ssmem:$src),
+                    !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
 
-let isTwoAddress = 1 in {
-  let isCommutable = 1 in {
-    def MAXPSrr  : PS_Intrr<0x5F, "maxps", int_x86_sse_max_ps>;
-    def MINPSrr  : PS_Intrr<0x5D, "minps", int_x86_sse_min_ps>;
+  // Vector intrinsic operation, reg
+  def PSr_Int : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int VR128:$src))]> {
+    let isCommutable = Commutable;
   }
 
-  def MAXPSrm  : PS_Intrm<0x5F, "maxps", int_x86_sse_max_ps>;
-  def MINPSrm  : PS_Intrm<0x5D, "minps", int_x86_sse_min_ps>;
+  // Vector intrinsic operation, mem (memory operand is a full v4f32).
+  def PSm_Int : PSI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                    !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int (load addr:$src)))]>;
 }
 
+// Square root.
+defm SQRT  : sse1_fp_unop_rm<0x51, "sqrt",  fsqrt,
+                             int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
+                             int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
+defm RCP   : sse1_fp_unop_rm<0x53, "rcp",   X86frcp,
+                             int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;
+
 // Logical
 let isTwoAddress = 1 in {
   let isCommutable = 1 in {
@@ -835,16 +911,6 @@ class PDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
 class PDIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
       : Ii8<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
 
-// Helpers for defining instructions that directly correspond to intrinsics.
-multiclass SD_IntUnary<bits<8> o, string OpcodeStr, Intrinsic IntId> {
-  def r : SDI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src),
-              !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
-              [(set VR128:$dst, (v2f64 (IntId VR128:$src)))]>;
-  def m : SDI<o, MRMSrcMem, (ops VR128:$dst, sdmem:$src),
-              !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
-              [(set VR128:$dst, (v2f64 (IntId sse_load_f64:$src)))]>;
-}
-
 // Move Instructions
 def MOVSDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, FR64:$src),
                   "movsd {$src, $dst|$dst, $src}", []>;
@@ -855,16 +921,6 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src),
                   "movsd {$src, $dst|$dst, $src}",
                   [(store FR64:$src, addr:$dst)]>;
 
-def SQRTSDr : SDI<0x51, MRMSrcReg, (ops FR64:$dst, FR64:$src),
-                  "sqrtsd {$src, $dst|$dst, $src}",
-                  [(set FR64:$dst, (fsqrt FR64:$src))]>;
-def SQRTSDm : SDI<0x51, MRMSrcMem, (ops FR64:$dst, f64mem:$src),
-                  "sqrtsd {$src, $dst|$dst, $src}",
-                  [(set FR64:$dst, (fsqrt (loadf64 addr:$src)))]>;
-
-// Aliases to match intrinsics which expect XMM operand(s).
-defm SQRTSD_Int : SD_IntUnary<0x51, "sqrtsd" , int_x86_sse2_sqrt_sd>;
-
 // Conversion instructions
 def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (ops GR32:$dst, FR64:$src),
                       "cvttsd2si {$src, $dst|$dst, $src}",
@@ -1013,20 +1069,20 @@ def FsANDNPDrm : PDI<0x55, MRMSrcMem,
                      "andnpd {$src2, $dst|$dst, $src2}", []>;
 }
 
-/// scalar_sse2_fp_binop_rm - Scalar SSE2 binops come in three basic forms:
-///  
-///  1. f64 - This comes in SSE2 form for doubles.
-///  2. rr vs rm - They include a reg+reg form and a reg+mem form.
+/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements undefined.
 ///
-/// In addition, scalar SSE ops have an intrinsic form.  This form is unlike the
-/// normal form, in that they take an entire vector (instead of a scalar) and
-/// leave the top elements undefined.  This adds another two variants of the
-/// above permutations, giving us 8 forms for 'instruction'.
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
 ///
 let isTwoAddress = 1 in {
-multiclass scalar_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
-                                   SDNode OpNode, Intrinsic F64Int,
-                                   bit Commutable = 0> {
+multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, Intrinsic F64Int,
+                                  bit Commutable = 0> {
   // Scalar operation, reg+reg.
   def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
                  !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
@@ -1039,14 +1095,26 @@ multiclass scalar_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
                  !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
                  [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
                  
-  // Vector intrinsic operation, reg+reg.
+  // Vector operation, reg+reg.
+  def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
   def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
     let isCommutable = Commutable;
   }
 
-  // Vector intrinsic operation, reg+mem.
+  // Intrinsic operation, reg+mem.
   def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2),
                      !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (F64Int VR128:$src1,
@@ -1055,13 +1123,82 @@ multiclass scalar_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
 }
 
 // Arithmetic instructions
-defm ADD : scalar_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>;
-defm MUL : scalar_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>;
-defm SUB : scalar_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>;
-defm DIV : scalar_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>;
+defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>;
+defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>;
+defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>;
+defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>;
 
-defm MAX : scalar_sse2_fp_binop_rm<0x5F, "max", X86fmax, int_x86_sse2_max_sd>;
-defm MIN : scalar_sse2_fp_binop_rm<0x5D, "min", X86fmin, int_x86_sse2_min_sd>;
+/// sse2_fp_binop_rm - Other SSE2 binops
+///
+/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form.  Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode,
+                            Intrinsic F64Int,
+                            Intrinsic V2F64Int,
+                            bit Commutable = 0> {
+
+  // Scalar operation, reg+reg.
+  def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                 !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2),
+                 !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+                 
+  // Vector operation, reg+reg.
+  def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2),
+                     !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1,
+                                               sse_load_f64:$src2))]>;
+
+  // Vector intrinsic operation, reg+reg.
+  def PDrr_Int : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, reg+mem (memory operand is a full v2f64).
+  def PDrm_Int : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                     !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V2F64Int VR128:$src1, (load addr:$src2)))]>;
+}
+}
+
+defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax,
+                            int_x86_sse2_max_sd, int_x86_sse2_max_pd>;
+defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
+                            int_x86_sse2_min_sd, int_x86_sse2_min_pd>;
 
 //===----------------------------------------------------------------------===//
 // SSE packed FP Instructions
@@ -1234,65 +1371,80 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                     Requires<[HasSSE2]>;
 }
 
-/// packed_sse2_fp_binop_rm - Packed SSE binops come in three basic forms:
-///  1. v2f64    - This comes in SSE2 form for doubles.
-///  2. rr vs rm - They include a reg+reg form and a ref+mem form.
+// Arithmetic
+
+/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms.
 ///
-let isTwoAddress = 1 in {
-multiclass packed_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
-                                   SDNode OpNode, bit Commutable = 0> {
-  // Packed operation, reg+reg.
-  def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
-               !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
-               [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, there is a special variant for the full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
+                           SDNode OpNode,
+                           Intrinsic F64Int,
+                           Intrinsic V2F64Int,
+                           bit Commutable = 0> {
+  // Scalar operation, reg.  NOTE(review): isCommutable looks moot for a unop.
+  def SDr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+                !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode FR64:$src))]> {
     let isCommutable = Commutable;
   }
 
-  // Packed operation, reg+mem.
-  def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
-                 !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
-                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>;
-}
-}
-
-defm ADD : packed_sse2_fp_binop_rm<0x58, "add", fadd, 1>;
-defm MUL : packed_sse2_fp_binop_rm<0x59, "mul", fmul, 1>;
-defm DIV : packed_sse2_fp_binop_rm<0x5E, "div", fdiv>;
-defm SUB : packed_sse2_fp_binop_rm<0x5C, "sub", fsub>;
+  // Scalar operation, mem (source is loaded through an f64mem operand).
+  def SDm : SDI<opc, MRMSrcMem, (ops FR64:$dst, f64mem:$src),
+                !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode (load addr:$src)))]>;
+                 
+  // Vector operation, reg (result is typed as v2f64).
+  def PDr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+              !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> {
+    let isCommutable = Commutable;
+  }
 
-// Arithmetic
+  // Vector operation, mem (source is a loadv2f64 through f128mem).
+  def PDm : PDI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>;
 
-class PD_Intr<bits<8> o, string OpcodeStr, Intrinsic IntId>
-  : PDI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src),
-        !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
-        [(set VR128:$dst, (IntId VR128:$src))]>;
-class PD_Intm<bits<8> o, string OpcodeStr, Intrinsic IntId>
-  : PDI<o, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
-        !strconcat(OpcodeStr, " {$src, $dst|$dst, $src}"),
-        [(set VR128:$dst, (IntId (load addr:$src)))]>;
-
-class PD_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
-  : PDI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
-        !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
-        [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
-class PD_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
-  : PDI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2),
-        !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
-        [(set VR128:$dst, (IntId VR128:$src1, (load addr:$src2)))]>;
+  // Scalar intrinsic operation, reg (operand is a whole VR128 vector).
+  def SDr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
 
-def SQRTPDr  : PD_Intr<0x51, "sqrtpd", int_x86_sse2_sqrt_pd>;
-def SQRTPDm  : PD_Intm<0x51, "sqrtpd", int_x86_sse2_sqrt_pd>;
+  // Scalar intrinsic operation, mem (memory source via sse_load_f64).
+  def SDm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, sdmem:$src),
+                    !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
 
-let isTwoAddress = 1 in {
-  let isCommutable = 1 in {
-    def MAXPDrr  : PD_Intrr<0x5F, "maxpd", int_x86_sse2_max_pd>;
-    def MINPDrr  : PD_Intrr<0x5D, "minpd", int_x86_sse2_min_pd>;
+  // Vector intrinsic operation, reg (matches the V2F64Int intrinsic).
+  def PDr_Int : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V2F64Int VR128:$src))]> {
+    let isCommutable = Commutable;
   }
 
-  def MAXPDrm  : PD_Intrm<0x5F, "maxpd", int_x86_sse2_max_pd>;
-  def MINPDrm  : PD_Intrm<0x5D, "minpd", int_x86_sse2_min_pd>;
+  // Vector intrinsic operation, mem.  Reads all 128 bits, so f128mem.
+  def PDm_Int : PDI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                    !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V2F64Int (load addr:$src)))]>;
 }
 
+// Square root (scalar sd and packed pd forms).
+defm SQRT : sse2_fp_unop_rm<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd,
+                            int_x86_sse2_sqrt_pd>;
+
+// There are no f64 versions of the reciprocal approximation instructions.
+
 // Logical
 let isTwoAddress = 1 in {
   let isCommutable = 1 in {
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 56ff6a7a82c..3ea437fe1e8 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -758,9 +758,21 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
       { X86::IMUL16rr,        X86::IMUL16rm },
       { X86::IMUL32rr,        X86::IMUL32rm },
       { X86::MAXPDrr,         X86::MAXPDrm },
+      { X86::MAXPDrr_Int,     X86::MAXPDrm_Int },
       { X86::MAXPSrr,         X86::MAXPSrm },
+      { X86::MAXPSrr_Int,     X86::MAXPSrm_Int },
+      { X86::MAXSDrr,         X86::MAXSDrm },
+      { X86::MAXSDrr_Int,     X86::MAXSDrm_Int },
+      { X86::MAXSSrr,         X86::MAXSSrm },
+      { X86::MAXSSrr_Int,     X86::MAXSSrm_Int },
       { X86::MINPDrr,         X86::MINPDrm },
+      { X86::MINPDrr_Int,     X86::MINPDrm_Int },
       { X86::MINPSrr,         X86::MINPSrm },
+      { X86::MINPSrr_Int,     X86::MINPSrm_Int },
+      { X86::MINSDrr,         X86::MINSDrm },
+      { X86::MINSDrr_Int,     X86::MINSDrm_Int },
+      { X86::MINSSrr,         X86::MINSSrm },
+      { X86::MINSSrr_Int,     X86::MINSSrm_Int },
       { X86::MULPDrr,         X86::MULPDrm },
       { X86::MULPSrr,         X86::MULPSrm },
       { X86::MULSDrr,         X86::MULSDrm },
@@ -825,15 +837,23 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
       { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm },
       { X86::PXORrr,          X86::PXORrm },
       { X86::RCPPSr,          X86::RCPPSm },
+      { X86::RCPPSr_Int,      X86::RCPPSm_Int },
       { X86::RSQRTPSr,        X86::RSQRTPSm },
+      { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int },
+      { X86::RSQRTSSr,        X86::RSQRTSSm },
+      { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int },
       { X86::SBB32rr,         X86::SBB32rm },
       { X86::SBB64rr,         X86::SBB64rm },
       { X86::SHUFPDrri,       X86::SHUFPDrmi },
       { X86::SHUFPSrri,       X86::SHUFPSrmi },
       { X86::SQRTPDr,         X86::SQRTPDm },
+      { X86::SQRTPDr_Int,     X86::SQRTPDm_Int },
       { X86::SQRTPSr,         X86::SQRTPSm },
+      { X86::SQRTPSr_Int,     X86::SQRTPSm_Int },
       { X86::SQRTSDr,         X86::SQRTSDm },
+      { X86::SQRTSDr_Int,     X86::SQRTSDm_Int },
       { X86::SQRTSSr,         X86::SQRTSSm },
+      { X86::SQRTSSr_Int,     X86::SQRTSSm_Int },
       { X86::SUB16rr,         X86::SUB16rm },
       { X86::SUB32rr,         X86::SUB32rm },
       { X86::SUB64rr,         X86::SUB64rm },
-- 
2.34.1