CellSPU:

[oota-llvm.git] / lib / Target / CellSPU / SPUISelLowering.cpp
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp

index 1cd00978eefcd22d4ada5fd85239732e80f772d4..c3c31e0f47090d398d001c6b1b6bb35ab351f906 100644 (file)
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -39,8 +39,8 @@ namespace {
  
    //! MVT mapping to useful data for Cell SPU
    struct valtype_map_s {
-    const MVT        valtype;
-    const int                   prefslot_byte;
+    const MVT   valtype;
+    const int   prefslot_byte;
    };
  
    const valtype_map_s valtype_map[] = {
@@ -130,9 +130,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
    addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);
  
-  // Initialize libcalls:
-  setLibcallName(RTLIB::MUL_I64, "__muldi3");
-
    // SPU has no sign or zero extended loads for i1, i8, i16:
    setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -151,6 +148,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  
+  setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Custom);
+
    // SPU constant load actions are custom lowered:
    setOperationAction(ISD::Constant,   MVT::i64, Custom);
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -165,14 +164,21 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
      setOperationAction(ISD::STORE, VT, Custom);
    }
  
-  // Custom lower BRCOND for i1, i8 to "promote" the result to
-  // i32 and i16, respectively.
+  // Custom lower BRCOND for i8 to "promote" the result to i16
    setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  
    // Expand the jumptable branches
    setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
    setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
+
+  // Custom lower SELECT_CC for most cases, but expand by default
    setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
+  setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
+  setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
+#if 0
+  setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);
+#endif
  
    // SPU has no intrinsics for these particular operations:
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
@@ -215,7 +221,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    setOperationAction(ISD::SHL,  MVT::i8,     Custom);
    setOperationAction(ISD::SRL,  MVT::i8,     Custom);
    setOperationAction(ISD::SRA,  MVT::i8,     Custom);
-  // And SPU needs custom lowering for shift left/right for i64
+
+  // SPU needs custom lowering for shift left/right for i64
    setOperationAction(ISD::SHL,  MVT::i64,    Custom);
    setOperationAction(ISD::SRL,  MVT::i64,    Custom);
    setOperationAction(ISD::SRA,  MVT::i64,    Custom);
@@ -223,7 +230,15 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    // Custom lower i8, i32 and i64 multiplications
    setOperationAction(ISD::MUL,  MVT::i8,     Custom);
    setOperationAction(ISD::MUL,  MVT::i32,    Custom);
-  setOperationAction(ISD::MUL,  MVT::i64,    Expand);
+  setOperationAction(ISD::MUL,  MVT::i64,    Expand);   // libcall
+
+  // SMUL_LOHI, UMUL_LOHI
+#if 0
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+#endif
  
    // Need to custom handle (some) common i8, i64 math ops
    setOperationAction(ISD::ADD,  MVT::i64,    Custom);
@@ -247,13 +262,11 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  
    // SPU has a version of select that implements (a&~c)|(b&c), just like
    // select ought to work:
-  setOperationAction(ISD::SELECT, MVT::i1,   Promote);
    setOperationAction(ISD::SELECT, MVT::i8,   Legal);
    setOperationAction(ISD::SELECT, MVT::i16,  Legal);
    setOperationAction(ISD::SELECT, MVT::i32,  Legal);
    setOperationAction(ISD::SELECT, MVT::i64,  Expand);
  
-  setOperationAction(ISD::SETCC, MVT::i1,    Promote);
    setOperationAction(ISD::SETCC, MVT::i8,    Legal);
    setOperationAction(ISD::SETCC, MVT::i16,   Legal);
    setOperationAction(ISD::SETCC, MVT::i32,   Legal);
@@ -265,6 +278,12 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
    setOperationAction(ISD::ANY_EXTEND,  MVT::i64, Custom);
  
+  // Custom lower truncates
+  setOperationAction(ISD::TRUNCATE, MVT::i8, Custom);
+  setOperationAction(ISD::TRUNCATE, MVT::i16, Custom);
+  setOperationAction(ISD::TRUNCATE, MVT::i32, Custom);
+  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
+
    // SPU has a legal FP -> signed INT instruction
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
@@ -299,7 +318,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  
    // We want to legalize GlobalAddress and ConstantPool nodes into the
    // appropriate instructions to materialize the address.
-  for (unsigned sctype = (unsigned) MVT::i1; sctype < (unsigned) MVT::f128;
+  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
         ++sctype) {
      MVT VT = (MVT::SimpleValueType)sctype;
  
@@ -383,7 +402,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
  
    setShiftAmountType(MVT::i32);
-  setSetCCResultContents(ZeroOrOneSetCCResult);
+  setBooleanContents(ZeroOrOneBooleanContent);
  
    setStackPointerRegisterToSaveRestore(SPU::R1);
  
@@ -394,6 +413,11 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    setTargetDAGCombine(ISD::ANY_EXTEND);
  
    computeRegisterProperties();
+
+  // Set pre-RA register scheduler default to BURR, which produces slightly
+  // better code than the default (could also be TDRR, but TargetLowering.h
+  // needs a mod to support that model):
+  setSchedulingPreference(SchedulingForRegPressure);
  }
  
  const char *
@@ -409,16 +433,10 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
      node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
      node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
      node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
-    node_names[(unsigned) SPUISD::INSERT_MASK] = "SPUISD::INSERT_MASK";
+    node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
      node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
      node_names[(unsigned) SPUISD::PROMOTE_SCALAR] = "SPUISD::PROMOTE_SCALAR";
-    node_names[(unsigned) SPUISD::EXTRACT_ELT0] = "SPUISD::EXTRACT_ELT0";
-    node_names[(unsigned) SPUISD::EXTRACT_ELT0_CHAINED]
-                                              = "SPUISD::EXTRACT_ELT0_CHAINED";
-    node_names[(unsigned) SPUISD::EXTRACT_I1_ZEXT] = "SPUISD::EXTRACT_I1_ZEXT";
-    node_names[(unsigned) SPUISD::EXTRACT_I1_SEXT] = "SPUISD::EXTRACT_I1_SEXT";
-    node_names[(unsigned) SPUISD::EXTRACT_I8_ZEXT] = "SPUISD::EXTRACT_I8_ZEXT";
-    node_names[(unsigned) SPUISD::EXTRACT_I8_SEXT] = "SPUISD::EXTRACT_I8_SEXT";
+    node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
      node_names[(unsigned) SPUISD::MPY] = "SPUISD::MPY";
      node_names[(unsigned) SPUISD::MPYU] = "SPUISD::MPYU";
      node_names[(unsigned) SPUISD::MPYH] = "SPUISD::MPYH";
@@ -434,11 +452,7 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
        "SPUISD::ROTQUAD_RZ_BYTES";
      node_names[(unsigned) SPUISD::ROTQUAD_RZ_BITS] =
        "SPUISD::ROTQUAD_RZ_BITS";
-    node_names[(unsigned) SPUISD::ROTBYTES_RIGHT_S] =
-      "SPUISD::ROTBYTES_RIGHT_S";
      node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
-    node_names[(unsigned) SPUISD::ROTBYTES_LEFT_CHAINED] =
-      "SPUISD::ROTBYTES_LEFT_CHAINED";
      node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
        "SPUISD::ROTBYTES_LEFT_BITS";
      node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
@@ -535,7 +549,6 @@ AlignedLoad(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST,
      alignOffs = int(FIN->getIndex() * SPUFrameInfo::stackSlotSize());
      prefSlotOffs = (int) (alignOffs & 0xf);
      prefSlotOffs -= vtm->prefslot_byte;
-    basePtr = DAG.getRegister(SPU::R1, VT);
    } else {
      alignOffs = 0;
      prefSlotOffs = -vtm->prefslot_byte;
@@ -576,13 +589,24 @@ AlignedLoad(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST,
  /*!
   All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
   within a 16-byte block, we have to rotate to extract the requested element.
- */
+
+ For extending loads, we also want to ensure that the following sequence is
+ emitted, e.g. for MVT::f32 extending load to MVT::f64:
+
+\verbatim
+%1  v16i8,ch = load 
+%2  v16i8,ch = rotate %1
+%3  v4f32,ch = bitconvert %2
+%4  f32      = vec2prefslot %3
+%5  f64      = fp_extend %4
+\endverbatim
+*/
  static SDValue
  LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    LoadSDNode *LN = cast<LoadSDNode>(Op);
    SDValue the_chain = LN->getChain();
-  MVT VT = LN->getMemoryVT();
-  MVT OpVT = Op.getNode()->getValueType(0);
+  MVT InVT = LN->getMemoryVT();
+  MVT OutVT = Op.getValueType();
    ISD::LoadExtType ExtType = LN->getExtensionType();
    unsigned alignment = LN->getAlignment();
    SDValue Ops[8];
@@ -592,7 +616,8 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
      int offset, rotamt;
      bool was16aligned;
      SDValue result =
-      AlignedLoad(Op, DAG, ST, LN,alignment, offset, rotamt, VT, was16aligned);
+      AlignedLoad(Op, DAG, ST, LN,alignment, offset, rotamt, InVT,
+                  was16aligned);
  
      if (result.getNode() == 0)
        return result;
@@ -604,57 +629,40 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
      if (rotamt != 0 || !was16aligned) {
        SDVTList vecvts = DAG.getVTList(MVT::v16i8, MVT::Other);
  
-      Ops[0] = the_chain;
-      Ops[1] = result;
+      Ops[0] = result;
        if (was16aligned) {
-        Ops[2] = DAG.getConstant(rotamt, MVT::i16);
+        Ops[1] = DAG.getConstant(rotamt, MVT::i16);
        } else {
          MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
          LoadSDNode *LN1 = cast<LoadSDNode>(result);
-        Ops[2] = DAG.getNode(ISD::ADD, PtrVT, LN1->getBasePtr(),
+        Ops[1] = DAG.getNode(ISD::ADD, PtrVT, LN1->getBasePtr(),
                               DAG.getConstant(rotamt, PtrVT));
        }
  
-      result = DAG.getNode(SPUISD::ROTBYTES_LEFT_CHAINED, vecvts, Ops, 3);
-      the_chain = result.getValue(1);
+      result = DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v16i8, Ops, 2);
      }
  
-    if (VT == OpVT || ExtType == ISD::EXTLOAD) {
-      SDVTList scalarvts;
-      MVT vecVT = MVT::v16i8;
+    // Convert the loaded v16i8 vector to the appropriate vector type
+    // specified by the operand:
+    MVT vecVT = MVT::getVectorVT(InVT, (128 / InVT.getSizeInBits()));
+    result = DAG.getNode(SPUISD::VEC2PREFSLOT, InVT,
+                        DAG.getNode(ISD::BIT_CONVERT, vecVT, result));
  
-      // Convert the loaded v16i8 vector to the appropriate vector type
-      // specified by the operand:
-      if (OpVT == VT) {
-        if (VT != MVT::i1)
-          vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
-      } else
-        vecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits()));
+    // Handle extending loads by extending the scalar result:
+    if (ExtType == ISD::SEXTLOAD) {
+      result = DAG.getNode(ISD::SIGN_EXTEND, OutVT, result);
+    } else if (ExtType == ISD::ZEXTLOAD) {
+      result = DAG.getNode(ISD::ZERO_EXTEND, OutVT, result);
+    } else if (ExtType == ISD::EXTLOAD) {
+      unsigned NewOpc = ISD::ANY_EXTEND;
  
-      Ops[0] = the_chain;
-      Ops[1] = DAG.getNode(ISD::BIT_CONVERT, vecVT, result);
-      scalarvts = DAG.getVTList((OpVT == VT ? VT : OpVT), MVT::Other);
-      result = DAG.getNode(SPUISD::EXTRACT_ELT0_CHAINED, scalarvts, Ops, 2);
-      the_chain = result.getValue(1);
-    } else {
-      // Handle the sign and zero-extending loads for i1 and i8:
-      unsigned NewOpC;
+      if (OutVT.isFloatingPoint())
+       NewOpc = ISD::FP_EXTEND;
  
-      if (ExtType == ISD::SEXTLOAD) {
-        NewOpC = (OpVT == MVT::i1
-                  ? SPUISD::EXTRACT_I1_SEXT
-                  : SPUISD::EXTRACT_I8_SEXT);
-      } else {
-        assert(ExtType == ISD::ZEXTLOAD);
-        NewOpC = (OpVT == MVT::i1
-                  ? SPUISD::EXTRACT_I1_ZEXT
-                  : SPUISD::EXTRACT_I8_ZEXT);
-      }
-
-      result = DAG.getNode(NewOpC, OpVT, result);
+      result = DAG.getNode(NewOpc, OutVT, result);
      }
  
-    SDVTList retvts = DAG.getVTList(OpVT, MVT::Other);
+    SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
      SDValue retops[2] = {
        result,
        the_chain
@@ -699,8 +707,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
      int chunk_offset, slot_offset;
      bool was16aligned;
  
-    // The vector type we really want to load from the 16-byte chunk, except
-    // in the case of MVT::i1, which has to be v16i8.
+    // The vector type we really want to load from the 16-byte chunk.
      MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())),
          stVecVT = MVT::getVectorVT(StVT, (128 / StVT.getSizeInBits()));
  
@@ -747,18 +754,19 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
      }
  
      SDValue insertEltOp =
-            DAG.getNode(SPUISD::INSERT_MASK, stVecVT, insertEltPtr);
+            DAG.getNode(SPUISD::SHUFFLE_MASK, vecVT, insertEltPtr);
      SDValue vectorizeOp =
              DAG.getNode(ISD::SCALAR_TO_VECTOR, vecVT, theValue);
  
-    result = DAG.getNode(SPUISD::SHUFB, vecVT, vectorizeOp, alignLoadVec,
-                         DAG.getNode(ISD::BIT_CONVERT, vecVT, insertEltOp));
+    result = DAG.getNode(SPUISD::SHUFB, vecVT,
+                        vectorizeOp, alignLoadVec,
+                        DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, insertEltOp));
  
      result = DAG.getStore(the_chain, result, basePtr,
                            LN->getSrcValue(), LN->getSrcValueOffset(),
                            LN->isVolatile(), LN->getAlignment());
  
-#if 0 && defined(NDEBUG)
+#if 0 && !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        const SDValue &currentRoot = DAG.getRoot();
  
@@ -769,7 +777,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
        DAG.setRoot(currentRoot);
      }
  #endif
-    
+
      return result;
      /*UNREACHED*/
    }
@@ -873,11 +881,11 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  static SDValue
  LowerConstant(SDValue Op, SelectionDAG &DAG) {
    MVT VT = Op.getValueType();
-  ConstantSDNode *CN = cast<ConstantSDNode>(Op.getNode());
  
    if (VT == MVT::i64) {
-    SDValue T = DAG.getConstant(CN->getZExtValue(), MVT::i64);
-    return DAG.getNode(SPUISD::EXTRACT_ELT0, VT,
+    ConstantSDNode *CN = cast<ConstantSDNode>(Op.getNode());
+    SDValue T = DAG.getConstant(CN->getZExtValue(), VT);
+    return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
                         DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T));
    } else {
      cerr << "LowerConstant: unhandled constant type "
@@ -894,21 +902,24 @@ LowerConstant(SDValue Op, SelectionDAG &DAG) {
  static SDValue
  LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
    MVT VT = Op.getValueType();
-  ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
-
-  assert((FP != 0) &&
-         "LowerConstantFP: Node is not ConstantFPSDNode");
  
    if (VT == MVT::f64) {
+    ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
+
+    assert((FP != 0) &&
+           "LowerConstantFP: Node is not ConstantFPSDNode");
+    
      uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
-    return DAG.getNode(ISD::BIT_CONVERT, VT,
-                       LowerConstant(DAG.getConstant(dbits, MVT::i64), DAG));
+    SDValue T = DAG.getConstant(dbits, MVT::i64);
+    SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T);
+    return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
+                       DAG.getNode(ISD::BIT_CONVERT, MVT::v2f64, Tvec));
    }
  
    return SDValue();
  }
  
-//! Lower MVT::i1, MVT::i8 brcond to a promoted type (MVT::i32, MVT::i16)
+//! Lower MVT::i8 brcond to a promoted type (MVT::i16)
  static SDValue
  LowerBRCOND(SDValue Op, SelectionDAG &DAG)
  {
@@ -916,8 +927,8 @@ LowerBRCOND(SDValue Op, SelectionDAG &DAG)
    MVT CondVT = Cond.getValueType();
    MVT CondNVT;
  
-  if (CondVT == MVT::i1 || CondVT == MVT::i8) {
-    CondNVT = (CondVT == MVT::i1 ? MVT::i32 : MVT::i16);
+  if (CondVT == MVT::i8) {
+    CondNVT = MVT::i16;
      return DAG.getNode(ISD::BRCOND, Op.getValueType(),
                        Op.getOperand(0),
                        DAG.getNode(ISD::ZERO_EXTEND, CondNVT, Op.getOperand(1)),
@@ -957,37 +968,37 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
  
        switch (ObjectVT.getSimpleVT()) {
        default: {
-       cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
-            << ObjectVT.getMVTString()
-            << "\n";
-       abort();
+        cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
+             << ObjectVT.getMVTString()
+             << "\n";
+        abort();
        }
        case MVT::i8:
-       ArgRegClass = &SPU::R8CRegClass;
-       break;
+        ArgRegClass = &SPU::R8CRegClass;
+        break;
        case MVT::i16:
-       ArgRegClass = &SPU::R16CRegClass;
-       break;
+        ArgRegClass = &SPU::R16CRegClass;
+        break;
        case MVT::i32:
-       ArgRegClass = &SPU::R32CRegClass;
-       break;
+        ArgRegClass = &SPU::R32CRegClass;
+        break;
        case MVT::i64:
-       ArgRegClass = &SPU::R64CRegClass;
-       break;
+        ArgRegClass = &SPU::R64CRegClass;
+        break;
        case MVT::f32:
-       ArgRegClass = &SPU::R32FPRegClass;
-       break;
+        ArgRegClass = &SPU::R32FPRegClass;
+        break;
        case MVT::f64:
-       ArgRegClass = &SPU::R64FPRegClass;
-       break;
+        ArgRegClass = &SPU::R64FPRegClass;
+        break;
        case MVT::v2f64:
        case MVT::v4f32:
        case MVT::v2i64:
        case MVT::v4i32:
        case MVT::v8i16:
        case MVT::v16i8:
-       ArgRegClass = &SPU::VECREGRegClass;
-       break;
+        ArgRegClass = &SPU::VECREGRegClass;
+        break;
        }
  
        unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
@@ -1035,8 +1046,8 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
    ArgValues.push_back(Root);
  
    // Return the new list of results.
-  return DAG.getMergeValues(Op.getNode()->getVTList(), &ArgValues[0],
-                            ArgValues.size());
+  return DAG.getNode(ISD::MERGE_VALUES, Op.getNode()->getVTList(),
+                     &ArgValues[0], ArgValues.size());
  }
  
  /// isLSAAddress - Return the immediate to use if the specified
@@ -1116,6 +1127,8 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
          ArgOffset += StackSlotSize;
        }
        break;
+    case MVT::v2i64:
+    case MVT::v2f64:
      case MVT::v4f32:
      case MVT::v4i32:
      case MVT::v8i16:
@@ -1244,6 +1257,7 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
      NumResults = 1;
      break;
    case MVT::v2f64:
+  case MVT::v2i64:
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
@@ -1591,7 +1605,7 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
    case MVT::v2f64: {
      uint64_t f64val = SplatBits;
      assert(SplatSize == 8
-           && "LowerBUILD_VECTOR: 64-bit float vector element: unexpected size.");
+           && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
      // NOTE: pretend the constant is an integer. LLVM won't load FP constants
      SDValue T = DAG.getConstant(f64val, MVT::i64);
      return DAG.getNode(ISD::BIT_CONVERT, MVT::v2f64,
@@ -1644,8 +1658,8 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
        // specialized masks to replace any and all 0's, 0xff's and 0x80's.
  
        // Detect if the upper or lower half is a special shuffle mask pattern:
-      upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
-      lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
+      upper_special = (upper == 0||upper == 0xffffffff||upper == 0x80000000);
+      lower_special = (lower == 0||lower == 0xffffffff||lower == 0x80000000);
  
        // Create lower vector if not a special pattern
        if (!lower_special) {
@@ -1717,11 +1731,11 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  /// which the Cell can operate. The code inspects V3 to ascertain whether the
  /// permutation vector, V3, is monotonically increasing with one "exception"
  /// element, e.g., (0, 1, _, 3). If this is the case, then generate a
-/// INSERT_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
+/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
  /// In either case, the net result is going to eventually invoke SHUFB to
  /// permute/shuffle the bytes from V1 and V2.
  /// \note
-/// INSERT_MASK is eventually selected as one of the C*D instructions, generate
+/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
  /// control word for byte/halfword/word insertion. This takes care of a single
  /// element move from V2 into V1.
  /// \note
@@ -1736,38 +1750,64 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
    // If we have a single element being moved from V1 to V2, this can be handled
    // using the C*[DX] compute mask instructions, but the vector elements have
    // to be monotonically increasing with one exception element.
-  MVT EltVT = V1.getValueType().getVectorElementType();
+  MVT VecVT = V1.getValueType();
+  MVT EltVT = VecVT.getVectorElementType();
    unsigned EltsFromV2 = 0;
    unsigned V2Elt = 0;
    unsigned V2EltIdx0 = 0;
    unsigned CurrElt = 0;
+  unsigned MaxElts = VecVT.getVectorNumElements();
+  unsigned PrevElt = 0;
+  unsigned V0Elt = 0;
    bool monotonic = true;
-  if (EltVT == MVT::i8)
+  bool rotate = true;
+
+  if (EltVT == MVT::i8) {
      V2EltIdx0 = 16;
-  else if (EltVT == MVT::i16)
+  } else if (EltVT == MVT::i16) {
      V2EltIdx0 = 8;
-  else if (EltVT == MVT::i32)
+  } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
      V2EltIdx0 = 4;
-  else
+  } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
+    V2EltIdx0 = 2;
+  } else
      assert(0 && "Unhandled vector type in LowerVECTOR_SHUFFLE");
  
-  for (unsigned i = 0, e = PermMask.getNumOperands();
-       EltsFromV2 <= 1 && monotonic && i != e;
-       ++i) {
-    unsigned SrcElt;
-    if (PermMask.getOperand(i).getOpcode() == ISD::UNDEF)
-      SrcElt = 0;
-    else
-      SrcElt = cast<ConstantSDNode>(PermMask.getOperand(i))->getZExtValue();
+  for (unsigned i = 0; i != PermMask.getNumOperands(); ++i) {
+    if (PermMask.getOperand(i).getOpcode() != ISD::UNDEF) {
+      unsigned SrcElt = cast<ConstantSDNode > (PermMask.getOperand(i))->getZExtValue();
  
-    if (SrcElt >= V2EltIdx0) {
-      ++EltsFromV2;
-      V2Elt = (V2EltIdx0 - SrcElt) << 2;
-    } else if (CurrElt != SrcElt) {
-      monotonic = false;
-    }
+      if (monotonic) {
+        if (SrcElt >= V2EltIdx0) {
+          if (1 >= (++EltsFromV2)) {
+            V2Elt = (V2EltIdx0 - SrcElt) << 2;
+          }
+        } else if (CurrElt != SrcElt) {
+          monotonic = false;
+        }
+
+        ++CurrElt;
+      }
  
-    ++CurrElt;
+      if (rotate) {
+        if (PrevElt > 0 && SrcElt < MaxElts) {
+          if ((PrevElt == SrcElt - 1)
+              || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
+            PrevElt = SrcElt;
+            if (SrcElt == 0)
+              V0Elt = i;
+          } else {
+            rotate = false;
+          }
+        } else if (PrevElt == 0) {
+          // First time through, need to keep track of previous element
+          PrevElt = SrcElt;
+        } else {
+          // This isn't a rotation, takes elements from vector 2
+          rotate = false;
+        }
+      }
+    }
    }
  
    if (EltsFromV2 == 1 && monotonic) {
@@ -1779,13 +1819,18 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
      // Initialize temporary register to 0
      SDValue InitTempReg =
        DAG.getCopyToReg(DAG.getEntryNode(), VReg, DAG.getConstant(0, PtrVT));
-    // Copy register's contents as index in INSERT_MASK:
+    // Copy register's contents as index in SHUFFLE_MASK:
      SDValue ShufMaskOp =
-      DAG.getNode(SPUISD::INSERT_MASK, V1.getValueType(),
+      DAG.getNode(SPUISD::SHUFFLE_MASK, MVT::v4i32,
                    DAG.getTargetConstant(V2Elt, MVT::i32),
                    DAG.getCopyFromReg(InitTempReg, VReg, PtrVT));
      // Use shuffle mask in SHUFB synthetic instruction:
      return DAG.getNode(SPUISD::SHUFB, V1.getValueType(), V2, V1, ShufMaskOp);
+  } else if (rotate) {
+    int rotamt = (MaxElts - V0Elt) * EltVT.getSizeInBits()/8;
+    
+    return DAG.getNode(SPUISD::ROTBYTES_LEFT, V1.getValueType(),
+                       V1, DAG.getConstant(rotamt, MVT::i16));
    } else {
     // Convert the SHUFFLE_VECTOR mask's input element units to the
     // actual bytes.
@@ -1806,7 +1851,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
      }
  
      SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8,
-                                      &ResultMask[0], ResultMask.size());
+                                    &ResultMask[0], ResultMask.size());
      return DAG.getNode(SPUISD::SHUFB, V1.getValueType(), V1, V2, VPermMask);
    }
  }
@@ -2047,83 +2092,161 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
    MVT VT = Op.getValueType();
    SDValue N = Op.getOperand(0);
    SDValue Elt = Op.getOperand(1);
-  SDValue ShufMask[16];
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt);
+  SDValue retval;
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+    // Constant argument:
+    int EltNo = (int) C->getZExtValue();
+
+    // sanity checks:
+    if (VT == MVT::i8 && EltNo >= 16)
+      assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
+    else if (VT == MVT::i16 && EltNo >= 8)
+      assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
+    else if (VT == MVT::i32 && EltNo >= 4)
+      assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
+    else if (VT == MVT::i64 && EltNo >= 2)
+      assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
+
+    if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
+      // i32 and i64: Element 0 is the preferred slot
+      return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, N);
+    }
  
-  assert(C != 0 && "LowerEXTRACT_VECTOR_ELT expecting constant SDNode");
+    // Need to generate shuffle mask and extract:
+    int prefslot_begin = -1, prefslot_end = -1;
+    int elt_byte = EltNo * VT.getSizeInBits() / 8;
  
-  int EltNo = (int) C->getZExtValue();
+    switch (VT.getSimpleVT()) {
+    default:
+      assert(false && "Invalid value type!");
+    case MVT::i8: {
+      prefslot_begin = prefslot_end = 3;
+      break;
+    }
+    case MVT::i16: {
+      prefslot_begin = 2; prefslot_end = 3;
+      break;
+    }
+    case MVT::i32:
+    case MVT::f32: {
+      prefslot_begin = 0; prefslot_end = 3;
+      break;
+    }
+    case MVT::i64:
+    case MVT::f64: {
+      prefslot_begin = 0; prefslot_end = 7;
+      break;
+    }
+    }
  
-  // sanity checks:
-  if (VT == MVT::i8 && EltNo >= 16)
-    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
-  else if (VT == MVT::i16 && EltNo >= 8)
-    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
-  else if (VT == MVT::i32 && EltNo >= 4)
-    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
-  else if (VT == MVT::i64 && EltNo >= 2)
-    assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
+    assert(prefslot_begin != -1 && prefslot_end != -1 &&
+           "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
+
+    unsigned int ShufBytes[16];
+    for (int i = 0; i < 16; ++i) {
+      // zero fill upper part of preferred slot, don't care about the
+      // other slots:
+      unsigned int mask_val;
+      if (i <= prefslot_end) {
+        mask_val =
+          ((i < prefslot_begin)
+           ? 0x80
+           : elt_byte + (i - prefslot_begin));
+
+        ShufBytes[i] = mask_val;
+      } else
+        ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
+    }
  
-  if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
-    // i32 and i64: Element 0 is the preferred slot
-    return DAG.getNode(SPUISD::EXTRACT_ELT0, VT, N);
-  }
+    SDValue ShufMask[4];
+    for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
+      unsigned bidx = i * 4;
+      unsigned int bits = ((ShufBytes[bidx] << 24) |
+                           (ShufBytes[bidx+1] << 16) |
+                           (ShufBytes[bidx+2] << 8) |
+                           ShufBytes[bidx+3]);
+      ShufMask[i] = DAG.getConstant(bits, MVT::i32);
+    }
  
-  // Need to generate shuffle mask and extract:
-  int prefslot_begin = -1, prefslot_end = -1;
-  int elt_byte = EltNo * VT.getSizeInBits() / 8;
+    SDValue ShufMaskVec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+                                      &ShufMask[0],
+                                      sizeof(ShufMask) / sizeof(ShufMask[0]));
  
-  switch (VT.getSimpleVT()) {
-  default:
-    assert(false && "Invalid value type!");
-  case MVT::i8: {
-    prefslot_begin = prefslot_end = 3;
-    break;
-  }
-  case MVT::i16: {
-    prefslot_begin = 2; prefslot_end = 3;
-    break;
-  }
-  case MVT::i32:
-  case MVT::f32: {
-    prefslot_begin = 0; prefslot_end = 3;
-    break;
-  }
-  case MVT::i64:
-  case MVT::f64: {
-    prefslot_begin = 0; prefslot_end = 7;
-    break;
-  }
-  }
+    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
+                         DAG.getNode(SPUISD::SHUFB, N.getValueType(),
+                                     N, N, ShufMaskVec));
+  } else {
+    // Variable index: Rotate the requested element into slot 0, then replicate
+    // slot 0 across the vector
+    MVT VecVT = N.getValueType();
+    if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) {
+      cerr << "LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit vector type!\n";
+      abort();
+    }
  
-  assert(prefslot_begin != -1 && prefslot_end != -1 &&
-         "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
+    // Make life easier by making sure the index is zero-extended to i32
+    if (Elt.getValueType() != MVT::i32)
+      Elt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Elt);
  
-  for (int i = 0; i < 16; ++i) {
-    // zero fill uppper part of preferred slot, don't care about the
-    // other slots:
-    unsigned int mask_val;
+    // Scale the index to a bit/byte shift quantity
+    APInt scaleFactor =
+            APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
+    unsigned scaleShift = scaleFactor.logBase2();
+    SDValue vecShift;
  
-    if (i <= prefslot_end) {
-      mask_val =
-        ((i < prefslot_begin)
-         ? 0x80
-         : elt_byte + (i - prefslot_begin));
+    if (scaleShift > 0) {
+      // Scale the shift factor:
+      Elt = DAG.getNode(ISD::SHL, MVT::i32, Elt,
+                        DAG.getConstant(scaleShift, MVT::i32));
+    }
  
-      ShufMask[i] = DAG.getConstant(mask_val, MVT::i8);
-    } else
-      ShufMask[i] = ShufMask[i % (prefslot_end + 1)];
-  }
+    vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, VecVT, N, Elt);
  
-  SDValue ShufMaskVec =
-    DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8,
-                &ShufMask[0],
-                sizeof(ShufMask) / sizeof(ShufMask[0]));
+    // Replicate the bytes starting at byte 0 across the entire vector (for
+    // consistency with the notion of a unified register set)
+    SDValue replicate;
+
+    switch (VT.getSimpleVT()) {
+    default:
+      cerr << "LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector type\n";
+      abort();
+      /*NOTREACHED*/
+    case MVT::i8: {
+      SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
+      replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor,
+                              factor, factor);
+      break;
+    }
+    case MVT::i16: {
+      SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
+      replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor,
+                              factor, factor);
+      break;
+    }
+    case MVT::i32:
+    case MVT::f32: {
+      SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
+      replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor,
+                              factor, factor);
+      break;
+    }
+    case MVT::i64:
+    case MVT::f64: {
+      SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
+      SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
+      replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, loFactor, hiFactor,
+                              loFactor, hiFactor);
+      break;
+    }
+    }
  
-  return DAG.getNode(SPUISD::EXTRACT_ELT0, VT,
-                     DAG.getNode(SPUISD::SHUFB, N.getValueType(),
-                                 N, N, ShufMaskVec));
+    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
+                         DAG.getNode(SPUISD::SHUFB, VecVT,
+                                     vecShift, vecShift, replicate));
+  }
  
+  return retval;
  }
  
  static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
@@ -2136,18 +2259,17 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
    assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
  
    MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
-  // Use $2 because it's always 16-byte aligned and it's available:
-  SDValue PtrBase = DAG.getRegister(SPU::R2, PtrVT);
+  // Use $sp ($1) because it's always 16-byte aligned and it's available:
+  SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, PtrVT,
+                                DAG.getRegister(SPU::R1, PtrVT),
+                                DAG.getConstant(CN->getSExtValue(), PtrVT));
+  SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, VT, Pointer);
  
    SDValue result =
      DAG.getNode(SPUISD::SHUFB, VT,
                  DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, ValOp),
-                VecOp,
-                DAG.getNode(SPUISD::INSERT_MASK, VT,
-                            DAG.getNode(ISD::ADD, PtrVT,
-                                        PtrBase,
-                                        DAG.getConstant(CN->getZExtValue(),
-                                                        PtrVT))));
+                VecOp, 
+               DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, ShufMask));
  
    return result;
  }
@@ -2271,19 +2393,45 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
  
      assert(Op0VT == MVT::i32
             && "CellSPU: Zero/sign extending something other than i32");
-    DEBUG(cerr << "CellSPU: LowerI64Math custom lowering zero/sign/any extend\n");
  
-    unsigned NewOpc = (Opc == ISD::SIGN_EXTEND
-                      ? SPUISD::ROTBYTES_RIGHT_S
-                      : SPUISD::ROTQUAD_RZ_BYTES);
-    SDValue PromoteScalar =
-      DAG.getNode(SPUISD::PROMOTE_SCALAR, Op0VecVT, Op0);
+    DEBUG(cerr << "CellSPU.LowerI64Math: lowering zero/sign/any extend\n");
  
-    return DAG.getNode(SPUISD::EXTRACT_ELT0, VT,
-                       DAG.getNode(ISD::BIT_CONVERT, VecVT,
-                                   DAG.getNode(NewOpc, Op0VecVT,
-                                               PromoteScalar,
-                                               DAG.getConstant(4, MVT::i32))));
+    SDValue PromoteScalar =
+            DAG.getNode(SPUISD::PROMOTE_SCALAR, Op0VecVT, Op0);
+
+    if (Opc != ISD::SIGN_EXTEND) {
+      // Use a shuffle to zero extend the i32 to i64 directly:
+      SDValue shufMask =
+              DAG.getNode(ISD::BUILD_VECTOR, Op0VecVT,
+                          DAG.getConstant(0x80808080, MVT::i32),
+                          DAG.getConstant(0x00010203, MVT::i32),
+                          DAG.getConstant(0x80808080, MVT::i32),
+                          DAG.getConstant(0x08090a0b, MVT::i32));
+      SDValue zextShuffle =
+              DAG.getNode(SPUISD::SHUFB, Op0VecVT,
+                          PromoteScalar, PromoteScalar, shufMask);
+
+      return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
+                         DAG.getNode(ISD::BIT_CONVERT, VecVT, zextShuffle));
+    } else {
+      // SPU has no "rotate quadword and replicate bit 0" (i.e. rotate/shift
+      // right and propagate the sign bit) instruction.
+      SDValue RotQuad =
+              DAG.getNode(SPUISD::ROTQUAD_RZ_BYTES, Op0VecVT,
+                          PromoteScalar, DAG.getConstant(4, MVT::i32));
+      SDValue SignQuad =
+              DAG.getNode(SPUISD::VEC_SRA, Op0VecVT,
+                          PromoteScalar, DAG.getConstant(32, MVT::i32));
+      SDValue SelMask =
+              DAG.getNode(SPUISD::SELECT_MASK, Op0VecVT,
+                          DAG.getConstant(0xf0f0, MVT::i16));
+      SDValue CombineQuad =
+              DAG.getNode(SPUISD::SELB, Op0VecVT,
+                          SignQuad, RotQuad, SelMask);
+
+      return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
+                         DAG.getNode(ISD::BIT_CONVERT, VecVT, CombineQuad));
+    }
    }
  
    case ISD::ADD: {
@@ -2310,7 +2458,7 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
                    DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
                                &ShufBytes[0], ShufBytes.size()));
  
-    return DAG.getNode(SPUISD::EXTRACT_ELT0, MVT::i64,
+    return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64,
                         DAG.getNode(SPUISD::ADD_EXTENDED, MVT::v2i64,
                                     Op0, Op1, ShiftedCarry));
    }
@@ -2339,7 +2487,7 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
                    DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
                                &ShufBytes[0], ShufBytes.size()));
  
-    return DAG.getNode(SPUISD::EXTRACT_ELT0, MVT::i64,
+    return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64,
                         DAG.getNode(SPUISD::SUB_EXTENDED, MVT::v2i64,
                                     Op0, Op1, ShiftedBorrow));
    }
@@ -2363,7 +2511,7 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
                    ShiftAmt,
                    DAG.getConstant(7, ShiftAmtVT));
  
-    return DAG.getNode(SPUISD::EXTRACT_ELT0, VT,
+    return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
                         DAG.getNode(SPUISD::SHLQUAD_L_BITS, VecVT,
                                     DAG.getNode(SPUISD::SHLQUAD_L_BYTES, VecVT,
                                                 MaskLower, ShiftAmtBytes),
@@ -2403,7 +2551,7 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
      }
  
      SDValue UpperHalfSign =
-      DAG.getNode(SPUISD::EXTRACT_ELT0, MVT::i32,
+      DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i32,
                    DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
                                DAG.getNode(SPUISD::VEC_SRA, MVT::v2i64,
                                            Op0, DAG.getConstant(31, MVT::i32))));
@@ -2422,7 +2570,7 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc)
        DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v2i64,
                    RotateLeftBytes, ShiftAmt);
  
-    return DAG.getNode(SPUISD::EXTRACT_ELT0, MVT::i64,
+    return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64,
                         RotateLeftBits);
    }
    }
@@ -2612,8 +2760,135 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
    return SDValue();
  }
  
-/// LowerOperation - Provide custom lowering hooks for some operations.
-///
+//! Lower ISD::SELECT_CC
+/*!
+  ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
+  SELB instruction.
+
+  \note Need to revisit this in the future: if the code path through the true
+  and false value computations is longer than the latency of a branch (6
+  cycles), then it would be more advantageous to branch and insert a new basic
+  block and branch on the condition. However, this code does not make that
+  assumption, given the simplistic uses so far.
+ */
+
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getValueType();            // type of the select's result
+  SDValue lhs = Op.getOperand(0);        // left operand of the comparison
+  SDValue rhs = Op.getOperand(1);        // right operand of the comparison
+  SDValue trueval = Op.getOperand(2);    // SELECT_CC true-value operand
+  SDValue falseval = Op.getOperand(3);   // SELECT_CC false-value operand
+  SDValue condition = Op.getOperand(4);  // condition code passed on to SETCC
+  // Rebuild the node as an explicit SETCC feeding SPUISD::SELB. ISD::SELECT
+  // would be the natural choice, but LLVM's legalizer folds SETCC/SELECT back
+  // into SELECT_CC, which ends in a "cannot select select_cc" assert.
+  // NOTE(review): SETCC is given the select's result type VT, and SELB's
+  // operand order is not visible here -- confirm both against SPUISD::SELB.
+  SDValue compare = DAG.getNode(ISD::SETCC, VT, lhs, rhs, condition);
+  return DAG.getNode(SPUISD::SELB, VT, trueval, falseval, compare);
+}
+
+//! Custom lower ISD::TRUNCATE using a single SHUFB on the promoted operand
+static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
+{
+  MVT VT = Op.getValueType();
+  MVT::SimpleValueType simpleVT = VT.getSimpleVT();
+  MVT VecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
+  // Op0 is the value being truncated; Op0VecVT is its 128-bit vector type.
+  SDValue Op0 = Op.getOperand(0);
+  MVT Op0VT = Op0.getValueType();
+  MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits()));
+  // Wrap the source scalar in a vector node so that SHUFB can operate on it.
+  SDValue PromoteScalar = DAG.getNode(SPUISD::PROMOTE_SCALAR, Op0VecVT, Op0);
+  // Shuffle control words; the mask vector is {high, low, high, low}.
+  unsigned maskLow;
+  unsigned maskHigh;
+
+  // Select control words: outer switch = source type, inner = result type.
+  switch (Op0VT.getSimpleVT()) {
+  case MVT::i128:
+    switch (simpleVT) {
+    case MVT::i64:
+      // least significant doubleword of quadword
+      maskHigh = 0x08090a0b;
+      maskLow = 0x0c0d0e0f;
+      break;
+    case MVT::i32:
+      // least significant word of quadword
+      maskHigh = maskLow = 0x0c0d0e0f;
+      break;
+    case MVT::i16:
+      // least significant halfword of quadword
+      maskHigh = maskLow = 0x0e0f0e0f;
+      break;
+    case MVT::i8:
+      // least significant byte of quadword
+      maskHigh = maskLow = 0x0f0f0f0f;
+      break;
+    default:
+      cerr << "Truncation to illegal type!";
+      abort();
+    }
+    break;
+  case MVT::i64:
+    switch (simpleVT) {
+    case MVT::i32:
+      // least significant word of doubleword
+      maskHigh = maskLow = 0x04050607;
+      break;
+    case MVT::i16:
+      // least significant halfword of doubleword
+      maskHigh = maskLow = 0x06070607;
+      break;
+    case MVT::i8:
+      // least significant byte of doubleword
+      maskHigh = maskLow = 0x07070707;
+      break;
+    default:
+      cerr << "Truncation to illegal type!";
+      abort();
+    }
+    break;
+  case MVT::i32:
+  case MVT::i16:
+    switch (simpleVT) {
+    case MVT::i16:
+      // least significant halfword of word
+      maskHigh = maskLow = 0x02030203;
+      break;
+    case MVT::i8:
+      // least significant byte of word/halfword
+      maskHigh = maskLow = 0x03030303;
+      break;
+    default:
+      cerr << "Truncation to illegal type!";
+      abort();
+    }
+    break;
+  default:
+    cerr << "Trying to lower truncation from illegal type!";
+    abort();
+  }
+  // NOTE(review): presumably each mask byte indexes a source byte -- confirm.
+  // Use a shuffle to perform the truncation
+  SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+                                 DAG.getConstant(maskHigh, MVT::i32),
+                                 DAG.getConstant(maskLow, MVT::i32),
+                                 DAG.getConstant(maskHigh, MVT::i32),
+                                 DAG.getConstant(maskLow, MVT::i32));
+
+  SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, Op0VecVT,
+                                     PromoteScalar, PromoteScalar, shufMask);
+
+  return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
+                     DAG.getNode(ISD::BIT_CONVERT, VecVT, truncShuffle));
+}
+
+//! Custom (target-specific) lowering entry point
+/*!
+  This is where LLVM's DAG selection process calls to do target-specific
+  lowering of nodes.
+ */
  SDValue
  SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
  {
@@ -2629,6 +2904,7 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
      abort();
    }
    case ISD::LOAD:
+  case ISD::EXTLOAD:
    case ISD::SEXTLOAD:
    case ISD::ZEXTLOAD:
      return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
@@ -2702,19 +2978,30 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
    case ISD::FDIV:
      if (VT == MVT::f32 || VT == MVT::v4f32)
        return LowerFDIVf32(Op, DAG);
-//    else if (Op.getValueType() == MVT::f64)
-//      return LowerFDIVf64(Op, DAG);
+#if 0
+    // This is probably a libcall
+    else if (Op.getValueType() == MVT::f64)
+      return LowerFDIVf64(Op, DAG);
+#endif
      else
        assert(0 && "Calling FDIV on unsupported MVT");
  
    case ISD::CTPOP:
      return LowerCTPOP(Op, DAG);
+
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG);
+
+  case ISD::TRUNCATE:
+    return LowerTRUNCATE(Op, DAG);
    }
  
    return SDValue();
  }
  
-SDNode *SPUTargetLowering::ReplaceNodeResults(SDNode *N, SelectionDAG &DAG)
+void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG)
  {
  #if 0
    unsigned Opc = (unsigned) N->getOpcode();
@@ -2733,7 +3020,6 @@ SDNode *SPUTargetLowering::ReplaceNodeResults(SDNode *N, SelectionDAG &DAG)
  #endif
  
    /* Otherwise, return unchanged */
-  return 0;
  }
  
  //===----------------------------------------------------------------------===//
@@ -2748,8 +3034,10 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
  #endif
    const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
    SelectionDAG &DAG = DCI.DAG;
-  SDValue Op0 = N->getOperand(0);      // everything has at least one operand
-  SDValue Result;                     // Initially, NULL result
+  SDValue Op0 = N->getOperand(0);       // everything has at least one operand
+  MVT NodeVT = N->getValueType(0);      // The node's value type
+  MVT Op0VT = Op0.getValueType();      // The first operand's result
+  SDValue Result;                       // Initially, empty result
  
    switch (N->getOpcode()) {
    default: break;
@@ -2765,14 +3053,19 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
          ConstantSDNode *CN0 = cast<ConstantSDNode>(Op1);
          ConstantSDNode *CN1 = cast<ConstantSDNode>(Op01);
          SDValue combinedConst =
-          DAG.getConstant(CN0->getZExtValue() + CN1->getZExtValue(),
-                          Op0.getValueType());
+          DAG.getConstant(CN0->getZExtValue() + CN1->getZExtValue(), Op0VT);
+
+#if !defined(NDEBUG)
+        if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+            cerr << "\n"
+                 << "Replace: (add " << CN0->getZExtValue() << ", "
+                 << "(SPUindirect <arg>, " << CN1->getZExtValue() << "))\n"
+                 << "With:    (SPUindirect <arg>, "
+                 << CN0->getZExtValue() + CN1->getZExtValue() << ")\n";
+        }
+#endif
  
-        DEBUG(cerr << "Replace: (add " << CN0->getZExtValue() << ", "
-                   << "(SPUindirect <arg>, " << CN1->getZExtValue() << "))\n");
-        DEBUG(cerr << "With:    (SPUindirect <arg>, "
-                   << CN0->getZExtValue() + CN1->getZExtValue() << ")\n");
-        return DAG.getNode(SPUISD::IndirectAddr, Op0.getValueType(),
+        return DAG.getNode(SPUISD::IndirectAddr, Op0VT,
                             Op0.getOperand(0), combinedConst);
        }
      } else if (isa<ConstantSDNode>(Op0)
@@ -2785,8 +3078,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
          ConstantSDNode *CN0 = cast<ConstantSDNode>(Op0);
          ConstantSDNode *CN1 = cast<ConstantSDNode>(Op11);
          SDValue combinedConst =
-          DAG.getConstant(CN0->getZExtValue() + CN1->getZExtValue(),
-                          Op0.getValueType());
+          DAG.getConstant(CN0->getZExtValue() + CN1->getZExtValue(), Op0VT);
  
          DEBUG(cerr << "Replace: (add " << CN0->getZExtValue() << ", "
                     << "(SPUindirect <arg>, " << CN1->getZExtValue() << "))\n");
@@ -2802,16 +3094,19 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND: {
-    if (Op0.getOpcode() == SPUISD::EXTRACT_ELT0 &&
-        N->getValueType(0) == Op0.getValueType()) {
+    if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
        // (any_extend (SPUextract_elt0 <arg>)) ->
        // (SPUextract_elt0 <arg>)
        // Types must match, however...
-      DEBUG(cerr << "Replace: ");
-      DEBUG(N->dump(&DAG));
-      DEBUG(cerr << "\nWith:    ");
-      DEBUG(Op0.getNode()->dump(&DAG));
-      DEBUG(cerr << "\n");
+#if !defined(NDEBUG)
+      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+        cerr << "\nReplace: ";
+        N->dump(&DAG);
+        cerr << "\nWith:    ";
+        Op0.getNode()->dump(&DAG);
+        cerr << "\n";
+      }
+#endif
  
        return Op0;
      }
@@ -2847,7 +3142,6 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
      if (isa<ConstantSDNode>(Op1)) {
        // Kill degenerate vector shifts:
        ConstantSDNode *CN = cast<ConstantSDNode>(Op1);
-
        if (CN->getZExtValue() == 0) {
          Result = Op0;
        }
@@ -2861,20 +3155,20 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
      case ISD::ANY_EXTEND:
      case ISD::ZERO_EXTEND:
      case ISD::SIGN_EXTEND: {
-      // (SPUpromote_scalar (any|sign|zero_extend (SPUextract_elt0 <arg>))) ->
+      // (SPUpromote_scalar (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
        // <arg>
        // but only if the SPUpromote_scalar and <arg> types match.
        SDValue Op00 = Op0.getOperand(0);
-      if (Op00.getOpcode() == SPUISD::EXTRACT_ELT0) {
+      if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
          SDValue Op000 = Op00.getOperand(0);
-        if (Op000.getValueType() == N->getValueType(0)) {
+        if (Op000.getValueType() == NodeVT) {
            Result = Op000;
          }
        }
        break;
      }
-    case SPUISD::EXTRACT_ELT0: {
-      // (SPUpromote_scalar (SPUextract_elt0 <arg>)) ->
+    case SPUISD::VEC2PREFSLOT: {
+      // (SPUpromote_scalar (SPUvec2prefslot <arg>)) ->
        // <arg>
        Result = Op0.getOperand(0);
        break;
@@ -2884,7 +3178,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
    }
    }
    // Otherwise, return unchanged.
-#if 1
+#ifndef NDEBUG
    if (Result.getNode()) {
      DEBUG(cerr << "\nReplace.SPU: ");
      DEBUG(N->dump(&DAG));
@@ -2965,7 +3259,7 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
  #if 0
    case CALL:
    case SHUFB:
-  case INSERT_MASK:
+  case SHUFFLE_MASK:
    case CNTB:
  #endif
  
@@ -2980,8 +3274,7 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
    }
  
    case SPUISD::LDRESULT:
-  case SPUISD::EXTRACT_ELT0:
-  case SPUISD::EXTRACT_ELT0_CHAINED: {
+  case SPUISD::VEC2PREFSLOT: {
      MVT OpVT = Op.getValueType();
      unsigned OpVTBits = OpVT.getSizeInBits();
      uint64_t InMask = OpVT.getIntegerVTBitMask();
@@ -2991,10 +3284,6 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
    }
  
  #if 0
-  case EXTRACT_I1_ZEXT:
-  case EXTRACT_I1_SEXT:
-  case EXTRACT_I8_ZEXT:
-  case EXTRACT_I8_SEXT:
    case MPY:
    case MPYU:
    case MPYH:
@@ -3008,9 +3297,7 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
    case SPUISD::VEC_ROTR:
    case SPUISD::ROTQUAD_RZ_BYTES:
    case SPUISD::ROTQUAD_RZ_BITS:
-  case SPUISD::ROTBYTES_RIGHT_S:
    case SPUISD::ROTBYTES_LEFT:
-  case SPUISD::ROTBYTES_LEFT_CHAINED:
    case SPUISD::SELECT_MASK:
    case SPUISD::SELB:
    case SPUISD::FPInterp: