First round of support for doing scalar FP using the SSE2 ISA extension and

author Nate Begeman <natebegeman@mac.com>

Wed, 6 Jul 2005 18:59:04 +0000 (18:59 +0000)

committer Nate Begeman <natebegeman@mac.com>

Wed, 6 Jul 2005 18:59:04 +0000 (18:59 +0000)
author Nate Begeman <natebegeman@mac.com>
Wed, 6 Jul 2005 18:59:04 +0000 (18:59 +0000)
committer Nate Begeman <natebegeman@mac.com>
Wed, 6 Jul 2005 18:59:04 +0000 (18:59 +0000)
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp

index f4b9039f482be573d82d5995c2fb30838a1fd55a..90f10c1fccb1cb526c0cd7216a96ad7cf54409d0 100644 (file)
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -27,6 +27,7 @@ namespace llvm {
    bool NoExcessFPPrecision;
    int  PatternISelTriState;
    bool UnsafeFPMath;
+  bool PICEnabled;
  };
  namespace {
    cl::opt<bool, true> PrintCode("print-machineinstrs",
@@ -52,6 +53,11 @@ namespace {
                 cl::desc("Enable optimizations that may decrease FP precision"),
                 cl::location(UnsafeFPMath),
                 cl::init(false));
+  cl::opt<bool, true>
+  EnablePIC("enable-pic",
+               cl::desc("Enable generation of position independant code"),
+               cl::location(PICEnabled),
+               cl::init(false));
  };
  
  //---------------------------------------------------------------------------
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h

index 1bb4ec2e9c3bbd109ca5a0d62f12497751200cb3..2fc022af4b132ee6b6209558e1d52fd0f515bc6f 100644 (file)
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -28,6 +28,7 @@ enum X86VectorEnum {
  };
  
  extern X86VectorEnum X86Vector;
+extern bool X86ScalarSSE;
  
  /// createX86SimpleInstructionSelector - This pass converts an LLVM function
  /// into a machine code representation in a very simple peep-hole fashion.  The
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td

index 874391dda9fd062eca31612c435ba35b89b2fe88..afa3ff7b71cad7c96a6bf4728d697dfb3cfa7f0b 100644 (file)
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -61,7 +61,7 @@ def IntelAsmWriter : AsmWriter {
  
  def X86 : Target {
    // Specify the callee saved registers.
-  let CalleeSavedRegisters = [ESI, EDI, EBX, EBP];
+  let CalleeSavedRegisters = [ESI, EDI, EBX, EBP, XMM4, XMM5, XMM6, XMM7];
  
    // Yes, pointers are 32-bits in size.
    let PointerType = i32;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp

index d55edc5841c472e165b6a2a18dffe871e54bfa81..789b8e26cef3e3140c5314882726ead09093c505 100644 (file)
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -361,8 +361,18 @@ void Emitter::emitInstruction(const MachineInstr &MI) {
    // Emit the repeat opcode prefix as needed.
    if ((Desc.TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3);
  
-  // Emit instruction prefixes if necessary
-  if (Desc.TSFlags & X86II::OpSize) MCE.emitByte(0x66);// Operand size...
+  // Emit the operand size opcode prefix as needed.
+  if (Desc.TSFlags & X86II::OpSize) MCE.emitByte(0x66);
+
+  // Emit the double precision sse fp opcode prefix as needed.
+  if ((Desc.TSFlags & X86II::Op0Mask) == X86II::XD) {
+    MCE.emitByte(0xF2); MCE.emitByte(0x0F);
+  }
+
+  // Emit the single precision sse fp opcode prefix as needed.
+  if ((Desc.TSFlags & X86II::Op0Mask) == X86II::XS) {
+    MCE.emitByte(0xF3); MCE.emitByte(0x0F);
+  }
  
    switch (Desc.TSFlags & X86II::Op0Mask) {
    case X86II::TB:
diff --git a/lib/Target/X86/X86ISelPattern.cpp b/lib/Target/X86/X86ISelPattern.cpp

index bc728a7a41b104b83f93e5894e1b6ddbddd6a870..5c561aa1716273e02d525b66857229642ec9bee5 100644 (file)
--- a/lib/Target/X86/X86ISelPattern.cpp
+++ b/lib/Target/X86/X86ISelPattern.cpp
@@ -97,15 +97,13 @@ namespace {
        setShiftAmountFlavor(Mask);   // shl X, 32 == shl X, 0
  
        // Set up the register classes.
+      // FIXME: Eliminate these two classes when legalize can handle promotions
+      // well.
+      addRegisterClass(MVT::i1, X86::R8RegisterClass);
        addRegisterClass(MVT::i8, X86::R8RegisterClass);
        addRegisterClass(MVT::i16, X86::R16RegisterClass);
        addRegisterClass(MVT::i32, X86::R32RegisterClass);
-      addRegisterClass(MVT::f64, X86::RFPRegisterClass);
-
-      // FIXME: Eliminate these two classes when legalize can handle promotions
-      // well.
-/**/  addRegisterClass(MVT::i1, X86::R8RegisterClass);
-
+      
        setOperationAction(ISD::SINT_TO_FP       , MVT::i64  , Custom);
        setOperationAction(ISD::BRCONDTWOWAY     , MVT::Other, Expand);
        setOperationAction(ISD::MEMMOVE          , MVT::Other, Expand);
@@ -123,7 +121,7 @@ namespace {
        setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
        setOperationAction(ISD::CTTZ             , MVT::i32  , Expand);
        setOperationAction(ISD::CTLZ             , MVT::i32  , Expand);
-
+      
        setOperationAction(ISD::READIO           , MVT::i1   , Expand);
        setOperationAction(ISD::READIO           , MVT::i8   , Expand);
        setOperationAction(ISD::READIO           , MVT::i16  , Expand);
@@ -132,24 +130,47 @@ namespace {
        setOperationAction(ISD::WRITEIO          , MVT::i8   , Expand);
        setOperationAction(ISD::WRITEIO          , MVT::i16  , Expand);
        setOperationAction(ISD::WRITEIO          , MVT::i32  , Expand);
-
-      if (!UnsafeFPMath) {
-        setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
-        setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
-      }
-
+      
        // These should be promoted to a larger select which is supported.
-/**/  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
+      setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
        setOperationAction(ISD::SELECT           , MVT::i8   , Promote);
-
+      
+      if (X86ScalarSSE) {
+        // Set up the FP register classes.
+        addRegisterClass(MVT::f32, X86::RXMMRegisterClass);
+        addRegisterClass(MVT::f64, X86::RXMMRegisterClass);
+        
+        setOperationAction(ISD::EXTLOAD,  MVT::f32, Expand);
+        setOperationAction(ISD::ZEXTLOAD, MVT::f32, Expand);
+        
+        // We don't support sin/cos/fabs/fneg/fmod
+        setOperationAction(ISD::FSIN , MVT::f64, Expand);
+        setOperationAction(ISD::FCOS , MVT::f64, Expand);
+        setOperationAction(ISD::FABS , MVT::f64, Expand);
+        setOperationAction(ISD::FNEG , MVT::f64, Expand);
+        setOperationAction(ISD::SREM , MVT::f64, Expand);
+        setOperationAction(ISD::FSIN , MVT::f32, Expand);
+        setOperationAction(ISD::FCOS , MVT::f32, Expand);
+        setOperationAction(ISD::FABS , MVT::f32, Expand);
+        setOperationAction(ISD::FNEG , MVT::f32, Expand);
+        setOperationAction(ISD::SREM , MVT::f32, Expand);
+      } else {
+        // Set up the FP register classes.
+        addRegisterClass(MVT::f64, X86::RFPRegisterClass);
+        
+        if (!UnsafeFPMath) {
+          setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
+          setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
+        }
+        
+        addLegalFPImmediate(+0.0); // FLD0
+        addLegalFPImmediate(+1.0); // FLD1
+        addLegalFPImmediate(-0.0); // FLD0/FCHS
+        addLegalFPImmediate(-1.0); // FLD1/FCHS
+      }
        computeRegisterProperties();
-
-      addLegalFPImmediate(+0.0); // FLD0
-      addLegalFPImmediate(+1.0); // FLD1
-      addLegalFPImmediate(-0.0); // FLD0/FCHS
-      addLegalFPImmediate(-1.0); // FLD1/FCHS
      }
-
+    
      // Return the number of bytes that a function should pop when it returns (in
      // addition to the space used by the return address).
      //
@@ -400,7 +421,10 @@ X86TargetLowering::LowerCCCCallTo(SDOperand Chain, const Type *RetTy,
      RetVals.push_back(MVT::i32);
      break;
    case MVT::f32:
-    RetVals.push_back(MVT::f64);
+    if (X86ScalarSSE)
+      RetVals.push_back(MVT::f32);
+    else
+      RetVals.push_back(MVT::f64);
      break;
    case MVT::i64:
      RetVals.push_back(MVT::i32);
@@ -805,7 +829,10 @@ X86TargetLowering::LowerFastCCCallTo(SDOperand Chain, const Type *RetTy,
      RetVals.push_back(MVT::i32);
      break;
    case MVT::f32:
-    RetVals.push_back(MVT::f64);
+    if (X86ScalarSSE)
+      RetVals.push_back(MVT::f32);
+    else
+      RetVals.push_back(MVT::f64);
      break;
    case MVT::i64:
      RetVals.push_back(MVT::i32);
@@ -1041,6 +1068,8 @@ void ISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
          BuildMI(BB, X86::MOV32rr, 1, LI->second).addReg(LI->first);
        } else if (RC == X86::RFPRegisterClass) {
          BuildMI(BB, X86::FpMOV, 1, LI->second).addReg(LI->first);
+      } else if (RC == X86::RXMMRegisterClass) {
+        BuildMI(BB, X86::MOVAPDrr, 1, LI->second).addReg(LI->first);
        } else {
          assert(0 && "Unknown regclass!");
        }
@@ -1641,6 +1670,11 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT,
      /*missing*/0,  /*missing*/0, X86::FCMOVB , X86::FCMOVBE,
      X86::FCMOVA ,  X86::FCMOVAE, X86::FCMOVP , X86::FCMOVNP
    };
+  static const unsigned SSE_CMOVTAB[] = {
+    0 /* CMPEQSS */, 4 /* CMPNEQSS */, 1 /* CMPLTSS */, 2 /* CMPLESS */,
+    2 /* CMPLESS */, 1 /* CMPLTSS */, /*missing*/0, /*missing*/0,
+    /*missing*/0,  /*missing*/0, /*missing*/0, /*missing*/0
+  };
  
    if (SetCCSDNode *SetCC = dyn_cast<SetCCSDNode>(Cond)) {
      if (MVT::isInteger(SetCC->getOperand(0).getValueType())) {
@@ -1657,6 +1691,20 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT,
        case ISD::SETULE: CondCode = BE; break;
        case ISD::SETUGE: CondCode = AE; break;
        }
+    } else if (X86ScalarSSE) {
+      switch (SetCC->getCondition()) {
+      default: assert(0 && "Unknown scalar fp comparison!");
+      case ISD::SETEQ:  CondCode = EQ; break;
+      case ISD::SETNE:  CondCode = NE; break;
+      case ISD::SETULT:
+      case ISD::SETLT:  CondCode = LT; break;
+      case ISD::SETULE:
+      case ISD::SETLE:  CondCode = LE; break;
+      case ISD::SETUGT:
+      case ISD::SETGT:  CondCode = GT; break;
+      case ISD::SETUGE:
+      case ISD::SETGE:  CondCode = GE; break;
+      }
      } else {
        // On a floating point condition, the flags are set as follows:
        // ZF  PF  CF   op
@@ -1693,6 +1741,79 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT,
      }
    }
  
+  // There's no SSE equivalent of FCMOVE.  In some cases we can fake it up; in
+  // others we will have to do the PowerPC thing and generate an MBB for the
+  // true and false values and select between them with a PHI.
+  if (X86ScalarSSE) { 
+    if (CondCode != NOT_SET) {
+      unsigned CMPSOpc = (SVT == MVT::f64) ? X86::CMPSDrr : X86::CMPSSrr;
+      unsigned CMPSImm = SSE_CMOVTAB[CondCode];
+      // FIXME check for min
+      // FIXME check for max
+      // FIXME check for reverse
+      unsigned LHS = SelectExpr(Cond.getOperand(0));
+      unsigned RHS = SelectExpr(Cond.getOperand(1));
+      // emit compare mask
+      unsigned MaskReg = MakeReg(SVT);
+      BuildMI(BB, CMPSOpc, 3, MaskReg).addReg(LHS).addReg(RHS).addImm(CMPSImm);
+      // emit and with mask
+      unsigned TrueMask = MakeReg(SVT);
+      unsigned AndOpc = (SVT == MVT::f32) ? X86::ANDPSrr : X86::ANDPDrr;
+      BuildMI(BB, AndOpc, 2, TrueMask).addReg(RTrue).addReg(MaskReg);
+      // emit and with inverse mask
+      unsigned FalseMask = MakeReg(SVT);
+      unsigned AndnOpc = (SVT == MVT::f32) ? X86::ANDNPSrr : X86::ANDNPDrr;
+      BuildMI(BB, AndnOpc, 2, FalseMask).addReg(RFalse).addReg(MaskReg);
+      // emit or into dest reg
+      unsigned OROpc = (SVT == MVT::f32) ? X86::ORPSrr : X86::ORPDrr;
+      BuildMI(BB, OROpc, 2, RDest).addReg(TrueMask).addReg(FalseMask);
+      return;
+    } else {
+      // do the test and branch thing
+      // Get the condition into the zero flag.
+      unsigned CondReg = SelectExpr(Cond);
+      BuildMI(BB, X86::TEST8rr, 2).addReg(CondReg).addReg(CondReg);
+
+      // Create an iterator with which to insert the MBB for copying the false
+      // value and the MBB to hold the PHI instruction for this SetCC.
+      MachineBasicBlock *thisMBB = BB;
+      const BasicBlock *LLVM_BB = BB->getBasicBlock();
+      ilist<MachineBasicBlock>::iterator It = BB;
+      ++It;
+
+      //  thisMBB:
+      //  ...
+      //   TrueVal = ...
+      //   cmpTY ccX, r1, r2
+      //   bCC sinkMBB
+      //   fallthrough --> copy0MBB
+      MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB);
+      MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB);
+      BuildMI(BB, X86::JNE, 1).addMBB(sinkMBB);
+      MachineFunction *F = BB->getParent();
+      F->getBasicBlockList().insert(It, copy0MBB);
+      F->getBasicBlockList().insert(It, sinkMBB);
+      // Update machine-CFG edges
+      BB->addSuccessor(copy0MBB);
+      BB->addSuccessor(sinkMBB);
+
+      //  copy0MBB:
+      //   %FalseValue = ...
+      //   # fallthrough to sinkMBB
+      BB = copy0MBB;
+      // Update machine-CFG edges
+      BB->addSuccessor(sinkMBB);
+
+      //  sinkMBB:
+      //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+      //  ...
+      BB = sinkMBB;
+      BuildMI(BB, X86::PHI, 4, RDest).addReg(RFalse)
+        .addMBB(copy0MBB).addReg(RTrue).addMBB(thisMBB);
+    }
+    return;
+  }
+
    unsigned Opc = 0;
    if (CondCode != NOT_SET) {
      switch (SVT) {
@@ -1702,7 +1823,7 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT,
      case MVT::f64: Opc = CMOVTABFP[CondCode]; break;
      }
    }
-
+  
    // Finally, if we weren't able to fold this, just emit the condition and test
    // it.
    if (CondCode == NOT_SET || Opc == 0) {
@@ -1757,8 +1878,8 @@ void ISel::EmitCMP(SDOperand LHS, SDOperand RHS, bool HasOneUse) {
        return;
      }
    } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(RHS)) {
-    if (CN->isExactlyValue(+0.0) ||
-        CN->isExactlyValue(-0.0)) {
+    if (!X86ScalarSSE && (CN->isExactlyValue(+0.0) ||
+                          CN->isExactlyValue(-0.0))) {
        unsigned Reg = SelectExpr(LHS);
        BuildMI(BB, X86::FTST, 1).addReg(Reg);
        BuildMI(BB, X86::FNSTSW8r, 0);
@@ -1791,7 +1912,8 @@ void ISel::EmitCMP(SDOperand LHS, SDOperand RHS, bool HasOneUse) {
    case MVT::i8:  Opc = X86::CMP8rr;  break;
    case MVT::i16: Opc = X86::CMP16rr; break;
    case MVT::i32: Opc = X86::CMP32rr; break;
-  case MVT::f64: Opc = X86::FUCOMIr; break;
+  case MVT::f32: Opc = X86::UCOMISSrr; break;
+  case MVT::f64: Opc = X86ScalarSSE ? X86::UCOMISDrr : X86::FUCOMIr; break;
    }
    unsigned Tmp1, Tmp2;
    if (getRegPressure(LHS) > getRegPressure(RHS)) {
@@ -2040,6 +2162,11 @@ unsigned ISel::SelectExpr(SDOperand N) {
    default:
      Node->dump();
      assert(0 && "Node not handled!\n");
+  case ISD::FP_EXTEND:
+    assert(X86ScalarSSE && "Scalar SSE FP must be enabled to use f32"); 
+    Tmp1 = SelectExpr(N.getOperand(0));
+    BuildMI(BB, X86::CVTSS2SDrr, 1, Result).addReg(Tmp1);
+    return Result;
    case ISD::CopyFromReg:
      Select(N.getOperand(0));
      if (Result == 1) {
@@ -2212,6 +2339,37 @@ unsigned ISel::SelectExpr(SDOperand N) {
  
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP: {
+    Tmp1 = SelectExpr(N.getOperand(0));  // Get the operand register
+    unsigned PromoteOpcode = 0;
+
+    // We can handle any sint to fp, and 8 and 16 uint to fp with the direct 
+    // sse conversion instructions.
+    if (X86ScalarSSE) {
+      MVT::ValueType SrcTy = N.getOperand(0).getValueType();
+      MVT::ValueType DstTy = N.getValueType();
+      switch (SrcTy) {
+      case MVT::i1:
+      case MVT::i8:
+        PromoteOpcode = (N.getOpcode() == ISD::UINT_TO_FP) ?
+          X86::MOVZX32rr8 : X86::MOVSX32rr8;
+        break;
+      case MVT::i16:
+        PromoteOpcode = (N.getOpcode() == ISD::UINT_TO_FP) ?
+          X86::MOVZX32rr16 : X86::MOVSX32rr16;
+        break;
+      default:
+        assert(N.getOpcode() != ISD::UINT_TO_FP);
+        break;
+      }
+      if (PromoteOpcode) {
+        BuildMI(BB, PromoteOpcode, 1, Tmp2).addReg(Tmp1);
+        Tmp1 = Tmp2;
+      }
+      Opc = (DstTy == MVT::f64) ? X86::CVTSI2SDrr : X86::CVTSI2SSrr;
+      BuildMI(BB, Opc, 1, Result).addReg(Tmp1);
+      return Result;
+    }
+    
      // FIXME: Most of this grunt work should be done by legalize!
      ContainsFPCode = true;
  
@@ -2221,8 +2379,6 @@ unsigned ISel::SelectExpr(SDOperand N) {
      //
      MVT::ValueType PromoteType = MVT::Other;
      MVT::ValueType SrcTy = N.getOperand(0).getValueType();
-    unsigned PromoteOpcode = 0;
-    unsigned RealDestReg = Result;
      switch (SrcTy) {
      case MVT::i1:
      case MVT::i8:
@@ -2245,8 +2401,6 @@ unsigned ISel::SelectExpr(SDOperand N) {
        break;
      }
  
-    Tmp1 = SelectExpr(N.getOperand(0));  // Get the operand register
-
      if (PromoteType != MVT::Other) {
        Tmp2 = MakeReg(PromoteType);
        BuildMI(BB, PromoteOpcode, 1, Tmp2).addReg(Tmp1);
@@ -2272,31 +2426,28 @@ unsigned ISel::SelectExpr(SDOperand N) {
        break;
      default: break; // No promotion required.
      }
-
-    if (Node->getOpcode() == ISD::UINT_TO_FP && Result != RealDestReg) {
-      // If this is a cast from uint -> double, we need to be careful when if
-      // the "sign" bit is set.  If so, we don't want to make a negative number,
-      // we want to make a positive number.  Emit code to add an offset if the
-      // sign bit is set.
-
-      // Compute whether the sign bit is set by shifting the reg right 31 bits.
-      unsigned IsNeg = MakeReg(MVT::i32);
-      BuildMI(BB, X86::SHR32ri, 2, IsNeg).addReg(Tmp1).addImm(31);
-
-      // Create a CP value that has the offset in one word and 0 in the other.
-      static ConstantInt *TheOffset = ConstantUInt::get(Type::ULongTy,
-                                                        0x4f80000000000000ULL);
-      unsigned CPI = F->getConstantPool()->getConstantPoolIndex(TheOffset);
-      BuildMI(BB, X86::FADD32m, 5, RealDestReg).addReg(Result)
-        .addConstantPoolIndex(CPI).addZImm(4).addReg(IsNeg).addSImm(0);
-    }
-    return RealDestReg;
+    return Result;
    }
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT: {
      // FIXME: Most of this grunt work should be done by legalize!
      Tmp1 = SelectExpr(N.getOperand(0));  // Get the operand register
  
+    // If the target supports SSE2 and is performing FP operations in SSE regs
+    // instead of the FP stack, then we can use the efficient CVTSS2SI and
+    // CVTSD2SI instructions.
+    if (ISD::FP_TO_SINT == N.getOpcode() && X86ScalarSSE) {
+      if (MVT::f32 == N.getOperand(0).getValueType()) {
+        BuildMI(BB, X86::CVTSS2SIrr, 1, Result).addReg(Tmp1);
+      } else if (MVT::f64 == N.getOperand(0).getValueType()) {
+        BuildMI(BB, X86::CVTSD2SIrr, 1, Result).addReg(Tmp1);
+      } else {
+        assert(0 && "Not an f32 or f64?");
+        abort();
+      }
+      return Result;
+    } 
+
      // Change the floating point control register to use "round towards zero"
      // mode when truncating to an integer value.
      //
@@ -2385,9 +2536,15 @@ unsigned ISel::SelectExpr(SDOperand N) {
        case MVT::i8:  Opc = X86::ADD8rm;  break;
        case MVT::i16: Opc = X86::ADD16rm; break;
        case MVT::i32: Opc = X86::ADD32rm; break;
+      case MVT::f32: Opc = X86::ADDSSrm; break;
        case MVT::f64:
          // For F64, handle promoted load operations (from F32) as well!
-        Opc = Op1.getOpcode() == ISD::LOAD ? X86::FADD64m : X86::FADD32m;
+        if (X86ScalarSSE) {
+          assert(Op1.getOpcode() == ISD::LOAD && "SSE load not promoted");
+          Opc = X86::ADDSDrm;
+        } else {
+          Opc = Op1.getOpcode() == ISD::LOAD ? X86::FADD64m : X86::FADD32m;
+        }
          break;
        }
        X86AddressMode AM;
@@ -2458,7 +2615,8 @@ unsigned ISel::SelectExpr(SDOperand N) {
      case MVT::i8:  Opc = X86::ADD8rr; break;
      case MVT::i16: Opc = X86::ADD16rr; break;
      case MVT::i32: Opc = X86::ADD32rr; break;
-    case MVT::f64: Opc = X86::FpADD; break;
+    case MVT::f32: Opc = X86::ADDSSrr; break;
+    case MVT::f64: Opc = X86ScalarSSE ? X86::ADDSDrr : X86::FpADD; break;
      }
  
      if (getRegPressure(Op0) > getRegPressure(Op1)) {
@@ -2472,18 +2630,29 @@ unsigned ISel::SelectExpr(SDOperand N) {
      BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2);
      return Result;
  
+  case ISD::FSQRT:
+    Tmp1 = SelectExpr(Node->getOperand(0));
+    if (X86ScalarSSE) {
+      Opc = (N.getValueType() == MVT::f32) ? X86::SQRTSSrr : X86::SQRTSDrr;
+      BuildMI(BB, Opc, 1, Result).addReg(Tmp1);
+    } else {
+      BuildMI(BB, X86::FSQRT, 1, Result).addReg(Tmp1);
+    }
+    return Result;
+
+  // FIXME:
+  // Once we can spill 16 byte constants into the constant pool, we can
+  // implement SSE equivalents of FABS and FCHS.
    case ISD::FABS:
    case ISD::FNEG:
    case ISD::FSIN:
    case ISD::FCOS:
-  case ISD::FSQRT:
      assert(N.getValueType()==MVT::f64 && "Illegal type for this operation");
      Tmp1 = SelectExpr(Node->getOperand(0));
      switch (N.getOpcode()) {
      default: assert(0 && "Unreachable!");
      case ISD::FABS: BuildMI(BB, X86::FABS, 1, Result).addReg(Tmp1); break;
      case ISD::FNEG: BuildMI(BB, X86::FCHS, 1, Result).addReg(Tmp1); break;
-    case ISD::FSQRT: BuildMI(BB, X86::FSQRT, 1, Result).addReg(Tmp1); break;
      case ISD::FSIN: BuildMI(BB, X86::FSIN, 1, Result).addReg(Tmp1); break;
      case ISD::FCOS: BuildMI(BB, X86::FCOS, 1, Result).addReg(Tmp1); break;
      }
@@ -2550,11 +2719,21 @@ unsigned ISel::SelectExpr(SDOperand N) {
        X86::SUB8rm, X86::SUB16rm, X86::SUB32rm, X86::FSUB32m, X86::FSUB64m,
        X86::SUB8rr, X86::SUB16rr, X86::SUB32rr, X86::FpSUB  , X86::FpSUB,
      };
+    static const unsigned SSE_SUBTab[] = {
+      X86::SUB8ri, X86::SUB16ri, X86::SUB32ri, 0, 0,
+      X86::SUB8rm, X86::SUB16rm, X86::SUB32rm, X86::SUBSSrm, X86::SUBSDrm,
+      X86::SUB8rr, X86::SUB16rr, X86::SUB32rr, X86::SUBSSrr, X86::SUBSDrr,
+    };
      static const unsigned MULTab[] = {
        0, X86::IMUL16rri, X86::IMUL32rri, 0, 0,
        0, X86::IMUL16rm , X86::IMUL32rm, X86::FMUL32m, X86::FMUL64m,
        0, X86::IMUL16rr , X86::IMUL32rr, X86::FpMUL  , X86::FpMUL,
      };
+    static const unsigned SSE_MULTab[] = {
+      0, X86::IMUL16rri, X86::IMUL32rri, 0, 0,
+      0, X86::IMUL16rm , X86::IMUL32rm, X86::MULSSrm, X86::MULSDrm,
+      0, X86::IMUL16rr , X86::IMUL32rr, X86::MULSSrr, X86::MULSDrr,
+    };
      static const unsigned ANDTab[] = {
        X86::AND8ri, X86::AND16ri, X86::AND32ri, 0, 0,
        X86::AND8rm, X86::AND16rm, X86::AND32rm, 0, 0,
@@ -2637,8 +2816,8 @@ unsigned ISel::SelectExpr(SDOperand N) {
        }
        switch (Node->getOpcode()) {
        default: assert(0 && "Unreachable!");
-      case ISD::SUB: Opc = SUBTab[Opc]; break;
-      case ISD::MUL: Opc = MULTab[Opc]; break;
+      case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break;
+      case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break;
        case ISD::AND: Opc = ANDTab[Opc]; break;
        case ISD::OR:  Opc =  ORTab[Opc]; break;
        case ISD::XOR: Opc = XORTab[Opc]; break;
@@ -2656,7 +2835,7 @@ unsigned ISel::SelectExpr(SDOperand N) {
          goto FoldOps;
        } else {
          // For FP, emit 'reverse' subract, with a memory operand.
-        if (N.getValueType() == MVT::f64) {
+        if (N.getValueType() == MVT::f64 && !X86ScalarSSE) {
            if (Op0.getOpcode() == ISD::EXTLOAD)
              Opc = X86::FSUBR32m;
            else
@@ -2678,13 +2857,17 @@ unsigned ISel::SelectExpr(SDOperand N) {
        case MVT::i8:  Opc = 5; break;
        case MVT::i16: Opc = 6; break;
        case MVT::i32: Opc = 7; break;
+      case MVT::f32: Opc = 8; break;
          // For F64, handle promoted load operations (from F32) as well!
-      case MVT::f64: Opc = Op1.getOpcode() == ISD::LOAD ? 9 : 8; break;
+      case MVT::f64: 
+        assert((!X86ScalarSSE || Op1.getOpcode() == ISD::LOAD) && 
+               "SSE load should have been promoted");
+        Opc = Op1.getOpcode() == ISD::LOAD ? 9 : 8; break;
        }
        switch (Node->getOpcode()) {
        default: assert(0 && "Unreachable!");
-      case ISD::SUB: Opc = SUBTab[Opc]; break;
-      case ISD::MUL: Opc = MULTab[Opc]; break;
+      case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break;
+      case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break;
        case ISD::AND: Opc = ANDTab[Opc]; break;
        case ISD::OR:  Opc =  ORTab[Opc]; break;
        case ISD::XOR: Opc = XORTab[Opc]; break;
@@ -2725,8 +2908,8 @@ unsigned ISel::SelectExpr(SDOperand N) {
      }
      switch (Node->getOpcode()) {
      default: assert(0 && "Unreachable!");
-    case ISD::SUB: Opc = SUBTab[Opc]; break;
-    case ISD::MUL: Opc = MULTab[Opc]; break;
+    case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break;
+    case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break;
      case ISD::AND: Opc = ANDTab[Opc]; break;
      case ISD::OR:  Opc =  ORTab[Opc]; break;
      case ISD::XOR: Opc = XORTab[Opc]; break;
@@ -2844,7 +3027,7 @@ unsigned ISel::SelectExpr(SDOperand N) {
  
      if (N.getOpcode() == ISD::SDIV) {
        // We can fold loads into FpDIVs, but not really into any others.
-      if (N.getValueType() == MVT::f64) {
+      if (N.getValueType() == MVT::f64 || !X86ScalarSSE) {
          // Check for reversed and unreversed DIV.
          if (isFoldableLoad(N.getOperand(0), N.getOperand(1), true)) {
            if (N.getOperand(0).getOpcode() == ISD::EXTLOAD)
@@ -2962,8 +3145,12 @@ unsigned ISel::SelectExpr(SDOperand N) {
        ClrOpcode = X86::MOV32ri;
        SExtOpcode = X86::CDQ;
        break;
+    case MVT::f32:
+      BuildMI(BB, X86::DIVSSrr, 2, Result).addReg(Tmp1).addReg(Tmp2);
+      return Result;
      case MVT::f64:
-      BuildMI(BB, X86::FpDIV, 2, Result).addReg(Tmp1).addReg(Tmp2);
+      Opc = X86ScalarSSE ? X86::DIVSDrr : X86::FpDIV;
+      BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2);
        return Result;
      }
  
@@ -3108,7 +3295,15 @@ unsigned ISel::SelectExpr(SDOperand N) {
      case MVT::i8:  Opc = X86::MOV8rm; break;
      case MVT::i16: Opc = X86::MOV16rm; break;
      case MVT::i32: Opc = X86::MOV32rm; break;
-    case MVT::f64: Opc = X86::FLD64m; ContainsFPCode = true; break;
+    case MVT::f32: Opc = X86::MOVSSrm; break;
+    case MVT::f64: 
+      if (X86ScalarSSE) {
+        Opc = X86::MOVSDrm;
+      } else {
+        Opc = X86::FLD64m;
+        ContainsFPCode = true; 
+      }
+      break;
      }
  
      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N.getOperand(1))){
@@ -3385,9 +3580,21 @@ unsigned ISel::SelectExpr(SDOperand N) {
            BuildMI(BB, X86::MOV32rr, 1, Result+1).addReg(X86::EDX);
          break;
        case MVT::f64:     // Floating-point return values live in %ST(0)
-        ContainsFPCode = true;
-        BuildMI(BB, X86::FpGETRESULT, 1, Result);
-        break;
+        if (X86ScalarSSE) {
+          ContainsFPCode = true;
+          BuildMI(BB, X86::FpGETRESULT, 1, X86::FP0);
+
+          unsigned Size = MVT::getSizeInBits(MVT::f64)/8;
+          MachineFunction *F = BB->getParent();
+          int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size);
+          addFrameReference(BuildMI(BB, X86::FST64m, 5), FrameIdx).addReg(X86::FP0);
+          addFrameReference(BuildMI(BB, X86::MOVSDrm, 4, Result), FrameIdx);
+          break;
+        } else {
+          ContainsFPCode = true;
+          BuildMI(BB, X86::FpGETRESULT, 1, Result);
+          break;
+        }
        }
      return Result+N.ResNo-1;
    }
@@ -3977,7 +4184,15 @@ void ISel::Select(SDOperand N) {
        case MVT::i8:  Opc = X86::MOV8rr; break;
        case MVT::i16: Opc = X86::MOV16rr; break;
        case MVT::i32: Opc = X86::MOV32rr; break;
-      case MVT::f64: Opc = X86::FpMOV; ContainsFPCode = true; break;
+      case MVT::f32: Opc = X86::MOVAPSrr; break;
+      case MVT::f64: 
+        if (X86ScalarSSE) {
+          Opc = X86::MOVAPDrr;
+        } else {
+          Opc = X86::FpMOV; 
+          ContainsFPCode = true; 
+        }
+        break;
        }
        BuildMI(BB, Opc, 1, Tmp2).addReg(Tmp1);
      }
@@ -4018,12 +4233,38 @@ void ISel::Select(SDOperand N) {
        }
        switch (N.getOperand(1).getValueType()) {
        default: assert(0 && "All other types should have been promoted!!");
+      case MVT::f32:
+        if (X86ScalarSSE) {
+          // Spill the value to memory and reload it into top of stack.
+          unsigned Size = MVT::getSizeInBits(MVT::f32)/8;
+          MachineFunction *F = BB->getParent();
+          int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size);
+          addFrameReference(BuildMI(BB, X86::MOVSSmr, 5), FrameIdx).addReg(Tmp1);
+          addFrameReference(BuildMI(BB, X86::FLD32m, 4, X86::FP0), FrameIdx);
+          BuildMI(BB, X86::FpSETRESULT, 1).addReg(X86::FP0);
+          ContainsFPCode = true; 
+        } else {
+          assert(0 && "MVT::f32 only legal with scalar sse fp");
+          abort();
+        }
+        break;
        case MVT::f64:
-       BuildMI(BB, X86::FpSETRESULT, 1).addReg(Tmp1);
-       break;
+        if (X86ScalarSSE) {
+          // Spill the value to memory and reload it into top of stack.
+          unsigned Size = MVT::getSizeInBits(MVT::f64)/8;
+          MachineFunction *F = BB->getParent();
+          int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size);
+          addFrameReference(BuildMI(BB, X86::MOVSDmr, 5), FrameIdx).addReg(Tmp1);
+          addFrameReference(BuildMI(BB, X86::FLD64m, 4, X86::FP0), FrameIdx);
+          BuildMI(BB, X86::FpSETRESULT, 1).addReg(X86::FP0);
+          ContainsFPCode = true; 
+        } else {
+          BuildMI(BB, X86::FpSETRESULT, 1).addReg(Tmp1);
+        }
+        break;
        case MVT::i32:
-       BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1);
-       break;
+        BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1);
+        break;
        }
        break;
      case 1:
@@ -4144,7 +4385,9 @@ void ISel::Select(SDOperand N) {
      switch (StoredTy) {
      default: assert(0 && "Cannot truncstore this type!");
      case MVT::i1: Opc = X86::MOV8mr; break;
-    case MVT::f32: Opc = X86::FST32m; break;
+    case MVT::f32:
+      assert(!X86ScalarSSE && "Cannot truncstore scalar SSE regs"); 
+      Opc = X86::FST32m; break;
      }
  
      std::vector<std::pair<unsigned, unsigned> > RP;
@@ -4176,7 +4419,6 @@ void ISel::Select(SDOperand N) {
        case MVT::i8:  Opc = X86::MOV8mi; break;
        case MVT::i16: Opc = X86::MOV16mi; break;
        case MVT::i32: Opc = X86::MOV32mi; break;
-      case MVT::f64: break;
        }
        if (Opc) {
          if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(2))) {
@@ -4215,7 +4457,8 @@ void ISel::Select(SDOperand N) {
      case MVT::i8:  Opc = X86::MOV8mr; break;
      case MVT::i16: Opc = X86::MOV16mr; break;
      case MVT::i32: Opc = X86::MOV32mr; break;
-    case MVT::f64: Opc = X86::FST64m; break;
+    case MVT::f32: Opc = X86::MOVSSmr; break;
+    case MVT::f64: Opc = X86ScalarSSE ? X86::MOVSDmr : X86::FST64m; break;
      }
  
      std::vector<std::pair<unsigned, unsigned> > RP;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index bda2cb73f6f80d3ade8ec182b0a85e3cedf2808f..957360b201366a77166ffd328ec4b8de2209f93b 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -28,7 +28,7 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
                                 unsigned& destReg) const {
    MachineOpCode oc = MI.getOpcode();
    if (oc == X86::MOV8rr || oc == X86::MOV16rr || oc == X86::MOV32rr ||
-      oc == X86::FpMOV) {
+      oc == X86::FpMOV  || oc == X86::MOVAPDrr) {
        assert(MI.getNumOperands() == 2 &&
               MI.getOperand(0).isRegister() &&
               MI.getOperand(1).isRegister() &&
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h

index 5b63ff93f5c7f7f10fea916b2b0ce6a29aa0d661..95e8205a00b68f8cd4557ffd097c5e1e720b82b0 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -107,6 +107,10 @@ namespace X86II {
      DA = 5 << Op0Shift,   DB = 6 << Op0Shift,
      DC = 7 << Op0Shift,   DD = 8 << Op0Shift,
      DE = 9 << Op0Shift,   DF = 10 << Op0Shift,
+    
+    // XS, XD - These prefix codes are for single and double precision scalar
+    // floating point operations performed in the SSE registers.
+    XD = 11 << Op0Shift,   XS = 12 << Op0Shift,
  
      //===------------------------------------------------------------------===//
      // This two-bit field describes the size of an immediate operand.  Zero is
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td

index 39a4317bc2b72590e85c032ffc6823c102f734cb..1376d8fe8f0c3abbe38213d8d8e43c6b53da1013 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -187,7 +187,8 @@ def JG  : IBr<0x8F, (ops i32imm:$dst), "jg $dst">, TB;
  //
  let isCall = 1 in
    // All calls clobber the non-callee saved registers...
-  let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0] in {
+  let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+              XMM0, XMM1, XMM2, XMM3] in {
      def CALLpcrel32 : I<0xE8, RawFrm, (ops calltarget:$dst), "call $dst">;
      def CALL32r     : I<0xFF, MRM2r, (ops R32:$dst), "call {*}$dst">;
      def CALL32m     : I<0xFF, MRM2m, (ops i32mem:$dst), "call {*}$dst">;
@@ -1436,6 +1437,23 @@ def CVTSS2SDrr: I<0x5A, MRMSrcReg, (ops R32:$dst, RXMM:$src),
                  "cvtss2sd {$src, $dst|$dst, $src}">, XD;
  def CVTSS2SDrm: I<0x5A, MRMSrcMem, (ops R32:$dst, f32mem:$src),
                  "cvtss2sd {$src, $dst|$dst, $src}">, XD;
+def CVTSI2SSrr: I<0x2A, MRMSrcReg, (ops RXMM:$dst, R32:$src),
+                "cvtsi2ss {$src, $dst|$dst, $src}">, XS;
+def CVTSI2SSrm: I<0x2A, MRMSrcMem, (ops RXMM:$dst, i32mem:$src),
+                "cvtsi2ss {$src, $dst|$dst, $src}">, XS;
+def CVTSI2SDrr: I<0x2A, MRMSrcReg, (ops RXMM:$dst, R32:$src),
+                "cvtsi2sd {$src, $dst|$dst, $src}">, XD;
+def CVTSI2SDrm: I<0x2A, MRMSrcMem, (ops RXMM:$dst, i32mem:$src),
+                "cvtsi2sd {$src, $dst|$dst, $src}">, XD;
+
+def SQRTSSrm : I<0x51, MRMSrcMem, (ops RXMM:$dst, f32mem:$src),
+                "sqrtss {$src, $dst|$dst, $src}">, XS;
+def SQRTSSrr : I<0x51, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "sqrtss {$src, $dst|$dst, $src}">, XS;
+def SQRTSDrm : I<0x51, MRMSrcMem, (ops RXMM:$dst, f64mem:$src),
+                "sqrtsd {$src, $dst|$dst, $src}">, XD;
+def SQRTSDrr : I<0x51, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "sqrtsd {$src, $dst|$dst, $src}">, XD;
  
  def UCOMISDrr: I<0x2E, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
                  "ucomisd {$src, $dst|$dst, $src}">, TB, OpSize;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp

index 08920cc2605893da3fd5ac2bbb227ba425fbf8f6..230debf7a7a9fdc3085af86ed5495aa1af82b567 100644 (file)
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -52,6 +52,7 @@ static unsigned getIdx(unsigned SpillSize) {
    case 32: return 2;
    case 64: return 3;   // FP in 64-bit spill mode.
    case 80: return 4;   // FP in 80-bit spill mode.
+  case 128: return 5;  // XMM reg in 128 bit mode.
    }
  }
  
@@ -59,18 +60,24 @@ void X86RegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator MI,
                                            unsigned SrcReg, int FrameIdx) const {
    static const unsigned Opcode[] =
-    { X86::MOV8mr, X86::MOV16mr, X86::MOV32mr, X86::FST64m, X86::FSTP80m };
+    { X86::MOV8mr, X86::MOV16mr, X86::MOV32mr, X86::FST64m, X86::FSTP80m,
+      X86::MOVAPDmr };
    unsigned Idx = getIdx(getSpillSize(SrcReg));
-  addFrameReference(BuildMI(MBB, MI, Opcode[Idx], 5), FrameIdx).addReg(SrcReg);
+  unsigned Opc = Opcode[Idx];
+  if (X86ScalarSSE && Opc == X86::FST64m) Opc = X86::MOVSDmr;
+  addFrameReference(BuildMI(MBB, MI, Opc, 5), FrameIdx).addReg(SrcReg);
  }
  
  void X86RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                             MachineBasicBlock::iterator MI,
                                             unsigned DestReg, int FrameIdx)const{
    static const unsigned Opcode[] =
-    { X86::MOV8rm, X86::MOV16rm, X86::MOV32rm, X86::FLD64m, X86::FLD80m };
+    { X86::MOV8rm, X86::MOV16rm, X86::MOV32rm, X86::FLD64m, X86::FLD80m,
+      X86::MOVAPDrm };
    unsigned Idx = getIdx(getSpillSize(DestReg));
-  addFrameReference(BuildMI(MBB, MI, Opcode[Idx], 4, DestReg), FrameIdx);
+  unsigned Opc = Opcode[Idx];
+  if (X86ScalarSSE && Opc == X86::FLD64m) Opc = X86::MOVSDrm;
+  addFrameReference(BuildMI(MBB, MI, Opc, 4, DestReg), FrameIdx);
  }
  
  void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
@@ -78,8 +85,11 @@ void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
                                     unsigned DestReg, unsigned SrcReg,
                                     const TargetRegisterClass *RC) const {
    static const unsigned Opcode[] =
-    { X86::MOV8rr, X86::MOV16rr, X86::MOV32rr, X86::FpMOV, X86::FpMOV };
-  BuildMI(MBB, MI, Opcode[getIdx(RC->getSize()*8)], 1, DestReg).addReg(SrcReg);
+    { X86::MOV8rr, X86::MOV16rr, X86::MOV32rr, X86::FpMOV, X86::FpMOV,
+      X86::MOVAPDrr };
+  unsigned Opc = Opcode[getIdx(RC->getSize()*8)];
+  if (X86ScalarSSE && Opc == X86::FpMOV) Opc = X86::MOVAPDrr;
+  BuildMI(MBB, MI, Opc, 1, DestReg).addReg(SrcReg);
  }
  
  static MachineInstr *MakeMInst(unsigned Opcode, unsigned FrameIndex,
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td

index da8e612daef31349c20be74ace3c2ba1a5753d1f..30190fc18c4d1987c98ca8744b0e8f1d24c77184 100644 (file)
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -99,8 +99,8 @@ def R32 : RegisterClass<i32, 32, [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
  // FIXME: These registers can contain both integer and fp values.  We should
  // figure out the right way to deal with that.  For now, since they'll be used
  // for scalar FP, they are being declared f64
-def RXMM : RegisterClass<f64, 128, [XMM0, XMM1, XMM2, XMM3, 
-                                    XMM4, XMM5, XMM6, XMM7]>;
+def RXMM : RegisterClass<f64, 32, [XMM0, XMM1, XMM2, XMM3, 
+                                   XMM4, XMM5, XMM6, XMM7]>;
  
  // FIXME: This sets up the floating point register files as though they are f64
  // values, though they really are f80 values.  This will cause us to spill
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp

index 2330182372c5a7004eb0f17f2aa9baf4ac4d21b4..def4f9cfa49088f0e11990ec4b26bd49c127c376 100644 (file)
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -26,6 +26,7 @@
  using namespace llvm;
  
  X86VectorEnum llvm::X86Vector = NoSSE;
+bool llvm::X86ScalarSSE = false;
  
  /// X86TargetMachineModule - Note that this is used on hosts that cannot link
  /// in a library unless there are references into the library.  In particular,
@@ -41,8 +42,11 @@ namespace {
    cl::opt<bool> DisableOutput("disable-x86-llc-output", cl::Hidden,
                                cl::desc("Disable the X86 asm printer, for use "
                                         "when profiling the code generator."));
+  cl::opt<bool, true> EnableSSEFP("enable-sse-scalar-fp",
+                cl::desc("Perform FP math in SSE regs instead of the FP stack"),
+                cl::location(X86ScalarSSE),
+                cl::init(false));
  
-#if 0
    // FIXME: This should eventually be handled with target triples and
    // subtarget support!
    cl::opt<X86VectorEnum, true>
@@ -54,7 +58,6 @@ namespace {
         clEnumValN(SSE3, "sse3", "  Enable SSE, SSE2, and SSE3 support"),
         clEnumValEnd),
      cl::location(X86Vector), cl::init(NoSSE));
-#endif
  
    // Register the target.
    RegisterTarget<X86TargetMachine> X("x86", "  IA-32 (Pentium and above)");
@@ -91,6 +94,8 @@ X86TargetMachine::X86TargetMachine(const Module &M, IntrinsicLowering *IL)
    : TargetMachine("X86", IL, true, 4, 4, 4, 4, 4),
      FrameInfo(TargetFrameInfo::StackGrowsDown, 8, -4),
      JITInfo(*this) {
+  // Scalar SSE FP requires at least SSE2
+  X86ScalarSSE &= X86Vector >= SSE2;
  }
author	Nate Begeman <natebegeman@mac.com>
	Wed, 6 Jul 2005 18:59:04 +0000 (18:59 +0000)
committer	Nate Begeman <natebegeman@mac.com>
	Wed, 6 Jul 2005 18:59:04 +0000 (18:59 +0000)
lib/Target/TargetMachine.cpp		patch \| blob \| history
lib/Target/X86/X86.h		patch \| blob \| history
lib/Target/X86/X86.td		patch \| blob \| history
lib/Target/X86/X86CodeEmitter.cpp		patch \| blob \| history
lib/Target/X86/X86ISelPattern.cpp		patch \| blob \| history
lib/Target/X86/X86InstrInfo.cpp		patch \| blob \| history
lib/Target/X86/X86InstrInfo.h		patch \| blob \| history
lib/Target/X86/X86InstrInfo.td		patch \| blob \| history
lib/Target/X86/X86RegisterInfo.cpp		patch \| blob \| history
lib/Target/X86/X86RegisterInfo.td		patch \| blob \| history
lib/Target/X86/X86TargetMachine.cpp		patch \| blob \| history