lib/Target/CellSPU/SPUISelLowering.cpp

   1 //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements the SPUTargetLowering class.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "SPURegisterNames.h"
  15 #include "SPUISelLowering.h"
  16 #include "SPUTargetMachine.h"
  17 #include "SPUFrameInfo.h"
  18 #include "llvm/ADT/APInt.h"
  19 #include "llvm/ADT/VectorExtras.h"
  20 #include "llvm/CodeGen/CallingConvLower.h"
  21 #include "llvm/CodeGen/MachineFrameInfo.h"
  22 #include "llvm/CodeGen/MachineFunction.h"
  23 #include "llvm/CodeGen/MachineInstrBuilder.h"
  24 #include "llvm/CodeGen/MachineRegisterInfo.h"
  25 #include "llvm/CodeGen/SelectionDAG.h"
  26 #include "llvm/Constants.h"
  27 #include "llvm/Function.h"
  28 #include "llvm/Intrinsics.h"
  29 #include "llvm/Support/Debug.h"
  30 #include "llvm/Support/MathExtras.h"
  31 #include "llvm/Target/TargetOptions.h"
  32
  33 #include <map>
  34
  35 using namespace llvm;
  36
  37 // File-local helpers; node_names is used in getTargetNodeName() below
  38 namespace {
       // Opcode -> printable name table. It starts out empty and is filled by
       // getTargetNodeName() the first time that function runs.
  39   std::map<unsigned, const char *> node_names;
  40
  41   //! MVT mapping to useful data for Cell SPU
  42   struct valtype_map_s {
  43     const MVT   valtype;        // value type this entry describes
  44     const int   prefslot_byte;  // byte offset LowerLOAD subtracts when computing its rotate amount
  45   };
  46
       // One entry per scalar value type that getValueTypeMapEntry() can resolve.
  47   const valtype_map_s valtype_map[] = {
  48     { MVT::i1,   3 },
  49     { MVT::i8,   3 },
  50     { MVT::i16,  2 },
  51     { MVT::i32,  0 },
  52     { MVT::f32,  0 },
  53     { MVT::i64,  0 },
  54     { MVT::f64,  0 },
  55     { MVT::i128, 0 }
  56   };
  57
       // Number of entries in valtype_map.
  58   const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
  59
       // Linear search of valtype_map for VT; returns a pointer to the matching
       // entry. When VT is not in the table, builds without NDEBUG print a
       // message and abort(), while builds with NDEBUG return a null pointer.
       // LowerLOAD dereferences the result without checking it, so callers must
       // only pass types that appear in valtype_map.
  60   const valtype_map_s *getValueTypeMapEntry(MVT VT) {
  61     const valtype_map_s *retval = 0;
  62
  63     for (size_t i = 0; i < n_valtype_map; ++i) {
  64       if (valtype_map[i].valtype == VT) {
  65         retval = valtype_map + i;
  66         break;
  67       }
  68     }
  69
  70 #ifndef NDEBUG
         // Debug builds only: treat a missing entry as a fatal error.
  71     if (retval == 0) {
  72       cerr << "getValueTypeMapEntry returns NULL for "
  73            << VT.getMVTString()
  74            << "\n";
  75       abort();
  76     }
  77 #endif
  78
  79     return retval;
  80   }
  81
  82 }
  83
     // Constructor: registers the SPU register classes and records, for each
     // (operation, value type) pair listed below, whether it is Legal, Custom,
     // Promote or Expand. Several pairs are set more than once in this body;
     // presumably the last call wins (confirm against TargetLowering), so the
     // statement order here should be treated as significant.
  84 SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  85   : TargetLowering(TM),
  86     SPUTM(TM)
  87 {
  88   // Tell the generic lowering code that division by a power of 2 is cheap.
  89   setPow2DivIsCheap();
  90
  91   // Use _setjmp/_longjmp instead of setjmp/longjmp.
  92   setUseUnderscoreSetJmp(true);
  93   setUseUnderscoreLongJmp(true);
  94
  95   // Set RTLIB libcall names as used by SPU:
  96   setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
  97
  98   // Set up the SPU's register classes:
  99   addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
 100   addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
 101   addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
 102   addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
 103   addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
 104   addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
 105   addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);
 106
 107   // Extending loads of i1 are promoted (the loop below handles i8 and up):
 108   setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
 109   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
 110   setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
 111
 112   setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
 113   setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);
 114
 115   // SPU constant load actions: i64 and f64 are custom lowered, f32 is legal:
 116   setOperationAction(ISD::Constant,   MVT::i64, Custom);
 117   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 118   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 119
 120   // SPU's loads and stores have to be custom lowered:
       // (visits every MVT from MVT::i8 up to, but not including, MVT::i128)
 121   for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
 122        ++sctype) {
 123     MVT VT = (MVT::SimpleValueType)sctype;
 124
 125     setOperationAction(ISD::LOAD,   VT, Custom);
 126     setOperationAction(ISD::STORE,  VT, Custom);
 127     setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
 128     setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
 129     setLoadExtAction(ISD::SEXTLOAD, VT, Custom);
 130
 131     // SMUL_LOHI, UMUL_LOHI are not legal for Cell:
 132     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 133     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 134
         // Expand truncating stores from VT to every lower-numbered MVT down to
         // MVT::i8.
 135     for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
 136       MVT StoreVT = (MVT::SimpleValueType) stype;
 137       setTruncStoreAction(VT, StoreVT, Expand);
 138     }
 139   }
 140
       // Same treatment for floating point loads and stores.
       // NOTE(review): the `<` bound stops before MVT::f64, and on the first pass
       // (sctype == MVT::f32) the inner loop starts below its own lower bound --
       // confirm that excluding f64 here is intentional.
 141   for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
 142        ++sctype) {
 143     MVT VT = (MVT::SimpleValueType) sctype;
 144
 145     setOperationAction(ISD::LOAD,   VT, Custom);
 146     setOperationAction(ISD::STORE,  VT, Custom);
 147
 148     for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
 149       MVT StoreVT = (MVT::SimpleValueType) stype;
 150       setTruncStoreAction(VT, StoreVT, Expand);
 151     }
 152   }
 153
 154   // Expand the jumptable branches
 155   setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
 156   setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
 157
 158   // Custom lower SELECT_CC for most cases, but expand by default
 159   setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
 160   setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
 161   setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
 162   setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
 163   setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);
 164
 165   // SPU has no intrinsics for these particular operations:
 166   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
 167
 168   // SPU has no SREM/UREM instructions
 169   setOperationAction(ISD::SREM, MVT::i32, Expand);
 170   setOperationAction(ISD::UREM, MVT::i32, Expand);
 171   setOperationAction(ISD::SREM, MVT::i64, Expand);
 172   setOperationAction(ISD::UREM, MVT::i64, Expand);
 173
 174   // We don't support sin/cos/sqrt/fmod
 175   setOperationAction(ISD::FSIN , MVT::f64, Expand);
 176   setOperationAction(ISD::FCOS , MVT::f64, Expand);
 177   setOperationAction(ISD::FREM , MVT::f64, Expand);
 178   setOperationAction(ISD::FSIN , MVT::f32, Expand);
 179   setOperationAction(ISD::FCOS , MVT::f32, Expand);
 180   setOperationAction(ISD::FREM , MVT::f32, Expand);
 181
 182   // Square root is unconditionally expanded for both f32 and f64
 183   setOperationAction(ISD::FSQRT, MVT::f64, Expand);
 184   setOperationAction(ISD::FSQRT, MVT::f32, Expand);
 185
 186   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 187   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 188
 189   // Make sure that DAGCombine doesn't insert illegal 64-bit constants
 190   setOperationAction(ISD::FABS,  MVT::f64, Custom);
 191
 192   // SPU can do rotate right and left, so legalize it... but customize for i8
 193   // because instructions don't exist.
 194
 195   // FIXME: Change from "expand" to appropriate type once ROTR is supported in
 196   //        .td files.
 197   setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
 198   setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
 199   setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);
 200
 201   setOperationAction(ISD::ROTL, MVT::i32,    Legal);
 202   setOperationAction(ISD::ROTL, MVT::i16,    Legal);
 203   setOperationAction(ISD::ROTL, MVT::i8,     Custom);
 204
 205   // SPU has no native version of shift left/right for i8
 206   setOperationAction(ISD::SHL,  MVT::i8,     Custom);
 207   setOperationAction(ISD::SRL,  MVT::i8,     Custom);
 208   setOperationAction(ISD::SRA,  MVT::i8,     Custom);
 209
 210   // Make these operations legal and handle them during instruction selection:
 211   setOperationAction(ISD::SHL,  MVT::i64,    Legal);
 212   setOperationAction(ISD::SRL,  MVT::i64,    Legal);
 213   setOperationAction(ISD::SRA,  MVT::i64,    Legal);
 214
 215   // i8 multiplication is custom lowered; i32 and i64 are marked Legal
 216   setOperationAction(ISD::MUL,  MVT::i8,     Custom);
 217   setOperationAction(ISD::MUL,  MVT::i32,    Legal);
 218   setOperationAction(ISD::MUL,  MVT::i64,    Legal);
 219
 220   // i8 add/sub are custom lowered; i64 add/sub are marked Legal
 221   setOperationAction(ISD::ADD,  MVT::i8,     Custom);
 222   setOperationAction(ISD::ADD,  MVT::i64,    Legal);
 223   setOperationAction(ISD::SUB,  MVT::i8,     Custom);
 224   setOperationAction(ISD::SUB,  MVT::i64,    Legal);
 225
 226   // SPU does not have BSWAP. It does have i32 support CTLZ.
 227   // CTPOP has to be custom lowered.
 228   setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
 229   setOperationAction(ISD::BSWAP, MVT::i64,   Expand);
 230
 231   setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
 232   setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
 233   setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
 234   setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
 235
 236   setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
 237   setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
 238
 239   setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
 240
 241   // SPU has a version of select that implements (a&~c)|(b&c), just like
 242   // select ought to work:
 243   setOperationAction(ISD::SELECT, MVT::i8,   Legal);
 244   setOperationAction(ISD::SELECT, MVT::i16,  Legal);
 245   setOperationAction(ISD::SELECT, MVT::i32,  Legal);
 246   setOperationAction(ISD::SELECT, MVT::i64,  Legal);
 247
 248   setOperationAction(ISD::SETCC, MVT::i8,    Legal);
 249   setOperationAction(ISD::SETCC, MVT::i16,   Legal);
 250   setOperationAction(ISD::SETCC, MVT::i32,   Legal);
 251   setOperationAction(ISD::SETCC, MVT::i64,   Legal);
 252   setOperationAction(ISD::SETCC, MVT::f64,   Custom);
 253
 254   // Custom lower i128 -> i64 truncates
 255   setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
 256
 257   // SPU has a legal FP -> signed INT instruction
       // (the i32 FP_TO_UINT entry is set again, to Promote, further down)
 258   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
 259   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 260   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
 261   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 262
 263   // f64 FDIV is expanded (libcall; see the RTLIB::DIV_F64 name set above)
 264   setOperationAction(ISD::FDIV, MVT::f64, Expand);      // libcall
 265
 266   // SPU has [U|S]INT_TO_FP
 267   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
 268   setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
 269   setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
 270   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
 271   setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
 272   setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
 273   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 274   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 275
 276   setOperationAction(ISD::BIT_CONVERT, MVT::i32, Legal);
 277   setOperationAction(ISD::BIT_CONVERT, MVT::f32, Legal);
 278   setOperationAction(ISD::BIT_CONVERT, MVT::i64, Legal);
 279   setOperationAction(ISD::BIT_CONVERT, MVT::f64, Legal);
 280
 281   // We cannot sextinreg(i1).  Expand to shifts.
 282   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 283
 284   // Support label based line numbers.
 285   setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
 286   setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
 287
 288   // We want to legalize GlobalAddress and ConstantPool nodes into the
 289   // appropriate instructions to materialize the address.
       // (visits every MVT from MVT::i8 up to, but not including, MVT::f128)
 290   for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
 291        ++sctype) {
 292     MVT VT = (MVT::SimpleValueType)sctype;
 293
 294     setOperationAction(ISD::GlobalAddress,  VT, Custom);
 295     setOperationAction(ISD::ConstantPool,   VT, Custom);
 296     setOperationAction(ISD::JumpTable,      VT, Custom);
 297   }
 298
 299   // RET must be custom lowered, to meet ABI requirements
 300   setOperationAction(ISD::RET,           MVT::Other, Custom);
 301
 302   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 303   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 304
 305   // Use the default implementation.
 306   setOperationAction(ISD::VAARG             , MVT::Other, Expand);
 307   setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
 308   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 309   setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
 310   setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
 311   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
 312   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);
 313
 314   // i64 <-> fp conversions are custom lowered (repeats the settings above).
 315   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 316   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 317
 318   // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
       // (this comes after the earlier Legal setting for the same pair)
 319   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
 320
 321   // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
 322   setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
 323
 324   // Register the vector types that live in the SPU vector register class.
 325   // Per-operation actions for vector types are set in the loop further down.
 326   addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
 327   addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
 328   addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
 329   addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
 330   addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
 331   addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);
 332
 333   // "Odd size" vector classes that we're willing to support:
 334   addRegisterClass(MVT::v2i32, SPU::VECREGRegisterClass);
 335
       // Apply one common set of actions to every vector value type:
 336   for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
 337        i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
 338     MVT VT = (MVT::SimpleValueType)i;
 339
 340     // add/sub are legal for all supported vector VT's.
 341     setOperationAction(ISD::ADD , VT, Legal);
 342     setOperationAction(ISD::SUB , VT, Legal);
 343     // mul is marked Legal here as well.
 344     // TODO: v2i64 vector multiply
 345     setOperationAction(ISD::MUL , VT, Legal);
 346
 347     setOperationAction(ISD::AND   , VT, Legal);
 348     setOperationAction(ISD::OR    , VT, Legal);
 349     setOperationAction(ISD::XOR   , VT, Legal);
 350     setOperationAction(ISD::LOAD  , VT, Legal);
 351     setOperationAction(ISD::SELECT, VT, Legal);
 352     setOperationAction(ISD::STORE,  VT, Legal);
 353
 354     // These operations need to be expanded:
 355     setOperationAction(ISD::SDIV, VT, Expand);
 356     setOperationAction(ISD::SREM, VT, Expand);
 357     setOperationAction(ISD::UDIV, VT, Expand);
 358     setOperationAction(ISD::UREM, VT, Expand);
 359
 360     // Custom lower build_vector, constant pool spills, insert and
 361     // extract vector elements:
 362     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 363     setOperationAction(ISD::ConstantPool, VT, Custom);
 364     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
 365     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 366     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 367     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 368   }
 369
       // The loop above marks AND/OR/XOR Legal for each vector type it visits;
       // for v16i8 they are set to Custom here instead.
 370   setOperationAction(ISD::AND, MVT::v16i8, Custom);
 371   setOperationAction(ISD::OR,  MVT::v16i8, Custom);
 372   setOperationAction(ISD::XOR, MVT::v16i8, Custom);
 373   setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
 374
 375   setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
 376
 377   setShiftAmountType(MVT::i32);
 378   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 379
 380   setStackPointerRegisterToSaveRestore(SPU::R1);
 381
 382   // We have target-specific dag combine patterns for the following nodes:
 383   setTargetDAGCombine(ISD::ADD);
 384   setTargetDAGCombine(ISD::ZERO_EXTEND);
 385   setTargetDAGCombine(ISD::SIGN_EXTEND);
 386   setTargetDAGCombine(ISD::ANY_EXTEND);
 387
 388   computeRegisterProperties();
 389
 390   // Set pre-RA register scheduler default to BURR, which produces slightly
 391   // better code than the default (could also be TDRR, but TargetLowering.h
 392   // needs a mod to support that model):
 393   setSchedulingPreference(SchedulingForRegPressure);
 394 }
 395
     // Return the printable name of an SPUISD opcode, or 0 when Opcode has no
     // entry in the table. The file-local node_names map is filled on the first
     // call (while it is still empty) and only searched on later calls.
 396 const char *
 397 SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
 398 {
       // First call: populate the opcode -> name table.
 399   if (node_names.empty()) {
 400     node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG";
 401     node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi";
 402     node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo";
 403     node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr";
 404     node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr";
 405     node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr";
 406     node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
 407     node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
 408     node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
 409     node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
 410     node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
 411     node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
 412     node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
 413     node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS";
 414     node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES";
 415     node_names[(unsigned) SPUISD::VEC_SHL] = "SPUISD::VEC_SHL";
 416     node_names[(unsigned) SPUISD::VEC_SRL] = "SPUISD::VEC_SRL";
 417     node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA";
 418     node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
 419     node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
 420     node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
 421     node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
 422             "SPUISD::ROTBYTES_LEFT_BITS";
 423     node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
 424     node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
 425     node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
 426     node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
 427     node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
 428   }
 429
       // find() is used rather than operator[] so unknown opcodes are not inserted.
 430   std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);
 431
 432   return ((i != node_names.end()) ? i->second : 0);
 433 }
 434
 435 //===----------------------------------------------------------------------===//
 436 // Return the Cell SPU's SETCC result type
 437 //===----------------------------------------------------------------------===//
 438
 439 MVT SPUTargetLowering::getSetCCResultType(MVT VT) const {
 440   // i8, i16 and i32 are returned unchanged; any other VT yields MVT::i32
 441   return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ? VT : MVT::i32);
 442 }
 443
 444 //===----------------------------------------------------------------------===//
 445 // Calling convention code:
 446 //===----------------------------------------------------------------------===//
 447
 448 #include "SPUGenCallingConv.inc"
 449
 450 //===----------------------------------------------------------------------===//
 451 //  LowerOperation implementation
 452 //===----------------------------------------------------------------------===//
 453
 454 /// Custom lower loads for CellSPU
 455 /*!
 456  All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
 457  within a 16-byte block, we have to rotate to extract the requested element.
 458
 459  For extending loads, we also want to ensure that the following sequence is
 460  emitted, e.g. for MVT::f32 extending load to MVT::f64:
 461
 462 \verbatim
 463 %1  v16i8,ch = load
 464 %2  v16i8,ch = rotate %1
 465 %3  v4f8, ch = bitconvert %2
 466 %4  f32      = vec2perfslot %3
 467 %5  f64      = fp_extend %4
 468 \endverbatim
 469 */
 470 static SDValue
 471 LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 472   LoadSDNode *LN = cast<LoadSDNode>(Op);
 473   SDValue the_chain = LN->getChain();
 474   MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 475   MVT InVT = LN->getMemoryVT();
 476   MVT OutVT = Op.getValueType();
 477   ISD::LoadExtType ExtType = LN->getExtensionType();
 478   unsigned alignment = LN->getAlignment();
 479   const valtype_map_s *vtm = getValueTypeMapEntry(InVT);
 480
 481   switch (LN->getAddressingMode()) {
 482   case ISD::UNINDEXED: {
 483     SDValue result;
 484     SDValue basePtr = LN->getBasePtr();
 485     SDValue rotate;
 486
 487     if (alignment == 16) {
 488       ConstantSDNode *CN;
 489
 490       // Special cases for a known aligned load to simplify the base pointer
 491       // and the rotation amount:
 492       if (basePtr.getOpcode() == ISD::ADD
 493           && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
 494         // Known offset into basePtr
 495         int64_t offset = CN->getSExtValue();
 496         int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte);
 497
 498         if (rotamt < 0)
 499           rotamt += 16;
 500
 501         rotate = DAG.getConstant(rotamt, MVT::i16);
 502
 503         // Simplify the base pointer for this case:
 504         basePtr = basePtr.getOperand(0);
 505         if ((offset & ~0xf) > 0) {
 506           basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT,
 507                                 basePtr,
 508                                 DAG.getConstant((offset & ~0xf), PtrVT));
 509         }
 510       } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
 511                  || (basePtr.getOpcode() == SPUISD::IndirectAddr
 512                      && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
 513                      && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
 514         // Plain aligned a-form address: rotate into preferred slot
 515         // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
 516         int64_t rotamt = -vtm->prefslot_byte;
 517         if (rotamt < 0)
 518           rotamt += 16;
 519         rotate = DAG.getConstant(rotamt, MVT::i16);
 520       } else {
 521         // Offset the rotate amount by the basePtr and the preferred slot
 522         // byte offset
 523         int64_t rotamt = -vtm->prefslot_byte;
 524         if (rotamt < 0)
 525           rotamt += 16;
 526         rotate = DAG.getNode(ISD::ADD, PtrVT,
 527                              basePtr,
 528                              DAG.getConstant(rotamt, PtrVT));
 529       }
 530     } else {
 531       // Unaligned load: must be more pessimistic about addressing modes:
 532       if (basePtr.getOpcode() == ISD::ADD) {
 533         MachineFunction &MF = DAG.getMachineFunction();
 534         MachineRegisterInfo &RegInfo = MF.getRegInfo();
 535         unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
 536         SDValue Flag;
 537
 538         SDValue Op0 = basePtr.getOperand(0);
 539         SDValue Op1 = basePtr.getOperand(1);
 540
 541         if (isa<ConstantSDNode>(Op1)) {
 542           // Convert the (add <ptr>, <const>) to an indirect address contained
 543           // in a register. Note that this is done because we need to avoid
 544           // creating a 0(reg) d-form address due to the SPU's block loads.
 545           basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, Op0, Op1);
 546           the_chain = DAG.getCopyToReg(the_chain, VReg, basePtr, Flag);
 547           basePtr = DAG.getCopyFromReg(the_chain, VReg, PtrVT);
 548         } else {
 549           // Convert the (add <arg1>, <arg2>) to an indirect address, which
 550           // will likely be lowered as a reg(reg) x-form address.
 551           basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, Op0, Op1);
 552         }
 553       } else {
 554         basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT,
 555                               basePtr,
 556                               DAG.getConstant(0, PtrVT));
 557       }
 558
 559       // Offset the rotate amount by the basePtr and the preferred slot
 560       // byte offset
 561       rotate = DAG.getNode(ISD::ADD, PtrVT,
 562                            basePtr,
 563                            DAG.getConstant(-vtm->prefslot_byte, PtrVT));
 564     }
 565
 566     // Re-emit as a v16i8 vector load
 567     result = DAG.getLoad(MVT::v16i8, the_chain, basePtr,
 568                          LN->getSrcValue(), LN->getSrcValueOffset(),
 569                          LN->isVolatile(), 16);
 570
 571     // Update the chain
 572     the_chain = result.getValue(1);
 573
 574     // Rotate into the preferred slot:
 575     result = DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v16i8,
 576                          result.getValue(0), rotate);
 577
 578     // Convert the loaded v16i8 vector to the appropriate vector type
 579     // specified by the operand:
 580     MVT vecVT = MVT::getVectorVT(InVT, (128 / InVT.getSizeInBits()));
 581     result = DAG.getNode(SPUISD::VEC2PREFSLOT, InVT,
 582                          DAG.getNode(ISD::BIT_CONVERT, vecVT, result));
 583
 584     // Handle extending loads by extending the scalar result:
 585     if (ExtType == ISD::SEXTLOAD) {
 586       result = DAG.getNode(ISD::SIGN_EXTEND, OutVT, result);
 587     } else if (ExtType == ISD::ZEXTLOAD) {
 588       result = DAG.getNode(ISD::ZERO_EXTEND, OutVT, result);
 589     } else if (ExtType == ISD::EXTLOAD) {
 590       unsigned NewOpc = ISD::ANY_EXTEND;
 591
 592       if (OutVT.isFloatingPoint())
 593         NewOpc = ISD::FP_EXTEND;
 594
 595       result = DAG.getNode(NewOpc, OutVT, result);
 596     }
 597
 598     SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
 599     SDValue retops[2] = {
 600       result,
 601       the_chain
 602     };
 603
 604     result = DAG.getNode(SPUISD::LDRESULT, retvts,
 605                          retops, sizeof(retops) / sizeof(retops[0]));
 606     return result;
 607   }
 608   case ISD::PRE_INC:
 609   case ISD::PRE_DEC:
 610   case ISD::POST_INC:
 611   case ISD::POST_DEC:
 612   case ISD::LAST_INDEXED_MODE:
 613     cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
 614             "UNINDEXED\n";
 615     cerr << (unsigned) LN->getAddressingMode() << "\n";
 616     abort();
 617     /*NOTREACHED*/
 618   }
 619
 620   return SDValue();
 621 }
 622
 623 /// Custom lower stores for CellSPU
 624 /*!
 625  All CellSPU stores are aligned to 16-byte boundaries, so for elements
 626  within a 16-byte block, we have to generate a shuffle to insert the
 627  requested element into its place, then store the resulting block.
 628  */
 629 static SDValue
 630 LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 631   StoreSDNode *SN = cast<StoreSDNode>(Op);
 632   SDValue Value = SN->getValue();
 633   MVT VT = Value.getValueType();
 634   MVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
 635   MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 636   unsigned alignment = SN->getAlignment();
 637
 638   switch (SN->getAddressingMode()) {
 639   case ISD::UNINDEXED: {
 640     // The vector type we really want to load from the 16-byte chunk.
 641     MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())),
 642         stVecVT = MVT::getVectorVT(StVT, (128 / StVT.getSizeInBits()));
 643
 644     SDValue alignLoadVec;
 645     SDValue basePtr = SN->getBasePtr();
 646     SDValue the_chain = SN->getChain();
 647     SDValue insertEltOffs;
 648
 649     if (alignment == 16) {
 650       ConstantSDNode *CN;
 651
 652       // Special cases for a known aligned load to simplify the base pointer
 653       // and insertion byte:
 654       if (basePtr.getOpcode() == ISD::ADD
 655           && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
 656         // Known offset into basePtr
 657         int64_t offset = CN->getSExtValue();
 658
 659         // Simplify the base pointer for this case:
 660         basePtr = basePtr.getOperand(0);
 661         insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, PtrVT,
 662                                     basePtr,
 663                                     DAG.getConstant((offset & 0xf), PtrVT));
 664
 665         if ((offset & ~0xf) > 0) {
 666           basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT,
 667                                 basePtr,
 668                                 DAG.getConstant((offset & ~0xf), PtrVT));
 669         }
 670       } else {
 671         // Otherwise, assume it's at byte 0 of basePtr
 672         insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, PtrVT,
 673                                     basePtr,
 674                                     DAG.getConstant(0, PtrVT));
 675       }
 676     } else {
 677       // Unaligned load: must be more pessimistic about addressing modes:
 678       if (basePtr.getOpcode() == ISD::ADD) {
 679         MachineFunction &MF = DAG.getMachineFunction();
 680         MachineRegisterInfo &RegInfo = MF.getRegInfo();
 681         unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
 682         SDValue Flag;
 683
 684         SDValue Op0 = basePtr.getOperand(0);
 685         SDValue Op1 = basePtr.getOperand(1);
 686
 687         if (isa<ConstantSDNode>(Op1)) {
 688           // Convert the (add <ptr>, <const>) to an indirect address contained
 689           // in a register. Note that this is done because we need to avoid
 690           // creating a 0(reg) d-form address due to the SPU's block loads.
 691           basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, Op0, Op1);
 692           the_chain = DAG.getCopyToReg(the_chain, VReg, basePtr, Flag);
 693           basePtr = DAG.getCopyFromReg(the_chain, VReg, PtrVT);
 694         } else {
 695           // Convert the (add <arg1>, <arg2>) to an indirect address, which
 696           // will likely be lowered as a reg(reg) x-form address.
 697           basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, Op0, Op1);
 698         }
 699       } else {
 700         basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT,
 701                               basePtr,
 702                               DAG.getConstant(0, PtrVT));
 703       }
 704
 705       // Insertion point is solely determined by basePtr's contents
 706       insertEltOffs = DAG.getNode(ISD::ADD, PtrVT,
 707                                   basePtr,
 708                                   DAG.getConstant(0, PtrVT));
 709     }
 710
 711     // Re-emit as a v16i8 vector load
 712     alignLoadVec = DAG.getLoad(MVT::v16i8, the_chain, basePtr,
 713                                SN->getSrcValue(), SN->getSrcValueOffset(),
 714                                SN->isVolatile(), 16);
 715
 716     // Update the chain
 717     the_chain = alignLoadVec.getValue(1);
 718
 719     LoadSDNode *LN = cast<LoadSDNode>(alignLoadVec);
 720     SDValue theValue = SN->getValue();
 721     SDValue result;
 722
 723     if (StVT != VT
 724         && (theValue.getOpcode() == ISD::AssertZext
 725             || theValue.getOpcode() == ISD::AssertSext)) {
 726       // Drill down and get the value for zero- and sign-extended
 727       // quantities
 728       theValue = theValue.getOperand(0);
 729     }
 730
 731     // If the base pointer is already a D-form address, then just create
 732     // a new D-form address with a slot offset and the original base pointer.
 733     // Otherwise generate a D-form address with the slot offset relative
 734     // to the stack pointer, which is always aligned.
 735 #if !defined(NDEBUG)
 736       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
 737         cerr << "CellSPU LowerSTORE: basePtr = ";
 738         basePtr.getNode()->dump(&DAG);
 739         cerr << "\n";
 740       }
 741 #endif
 742
 743     SDValue insertEltOp =
 744             DAG.getNode(SPUISD::SHUFFLE_MASK, vecVT, insertEltOffs);
 745     SDValue vectorizeOp =
 746             DAG.getNode(ISD::SCALAR_TO_VECTOR, vecVT, theValue);
 747
 748     result = DAG.getNode(SPUISD::SHUFB, vecVT,
 749                          vectorizeOp, alignLoadVec,
 750                          DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, insertEltOp));
 751
 752     result = DAG.getStore(the_chain, result, basePtr,
 753                           LN->getSrcValue(), LN->getSrcValueOffset(),
 754                           LN->isVolatile(), LN->getAlignment());
 755
 756 #if 0 && !defined(NDEBUG)
 757     if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
 758       const SDValue &currentRoot = DAG.getRoot();
 759
 760       DAG.setRoot(result);
 761       cerr << "------- CellSPU:LowerStore result:\n";
 762       DAG.dump();
 763       cerr << "-------\n";
 764       DAG.setRoot(currentRoot);
 765     }
 766 #endif
 767
 768     return result;
 769     /*UNREACHED*/
 770   }
 771   case ISD::PRE_INC:
 772   case ISD::PRE_DEC:
 773   case ISD::POST_INC:
 774   case ISD::POST_DEC:
 775   case ISD::LAST_INDEXED_MODE:
 776     cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
 777             "UNINDEXED\n";
 778     cerr << (unsigned) SN->getAddressingMode() << "\n";
 779     abort();
 780     /*NOTREACHED*/
 781   }
 782
 783   return SDValue();
 784 }
 785
 786 //! Generate the address of a constant pool entry.
 787 SDValue
 788 LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 789   MVT PtrVT = Op.getValueType();
 790   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
 791   Constant *C = CP->getConstVal();
 792   SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
 793   SDValue Zero = DAG.getConstant(0, PtrVT);
 794   const TargetMachine &TM = DAG.getTarget();
 795
 796   if (TM.getRelocationModel() == Reloc::Static) {
 797     if (!ST->usingLargeMem()) {
 798       // Just return the SDValue with the constant pool address in it.
 799       return DAG.getNode(SPUISD::AFormAddr, PtrVT, CPI, Zero);
 800     } else {
 801       SDValue Hi = DAG.getNode(SPUISD::Hi, PtrVT, CPI, Zero);
 802       SDValue Lo = DAG.getNode(SPUISD::Lo, PtrVT, CPI, Zero);
 803       return DAG.getNode(SPUISD::IndirectAddr, PtrVT, Hi, Lo);
 804     }
 805   }
 806
 807   assert(0 &&
 808          "LowerConstantPool: Relocation model other than static"
 809          " not supported.");
 810   return SDValue();
 811 }
 812
 813 //! Alternate entry point for generating the address of a constant pool entry
 814 SDValue
 815 SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
 816   return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
 817 }
 818
 819 static SDValue
 820 LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 821   MVT PtrVT = Op.getValueType();
 822   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
 823   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
 824   SDValue Zero = DAG.getConstant(0, PtrVT);
 825   const TargetMachine &TM = DAG.getTarget();
 826
 827   if (TM.getRelocationModel() == Reloc::Static) {
 828     if (!ST->usingLargeMem()) {
 829       return DAG.getNode(SPUISD::AFormAddr, PtrVT, JTI, Zero);
 830     } else {
 831       SDValue Hi = DAG.getNode(SPUISD::Hi, PtrVT, JTI, Zero);
 832       SDValue Lo = DAG.getNode(SPUISD::Lo, PtrVT, JTI, Zero);
 833       return DAG.getNode(SPUISD::IndirectAddr, PtrVT, Hi, Lo);
 834     }
 835   }
 836
 837   assert(0 &&
 838          "LowerJumpTable: Relocation model other than static not supported.");
 839   return SDValue();
 840 }
 841
 842 static SDValue
 843 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 844   MVT PtrVT = Op.getValueType();
 845   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
 846   GlobalValue *GV = GSDN->getGlobal();
 847   SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset());
 848   const TargetMachine &TM = DAG.getTarget();
 849   SDValue Zero = DAG.getConstant(0, PtrVT);
 850
 851   if (TM.getRelocationModel() == Reloc::Static) {
 852     if (!ST->usingLargeMem()) {
 853       return DAG.getNode(SPUISD::AFormAddr, PtrVT, GA, Zero);
 854     } else {
 855       SDValue Hi = DAG.getNode(SPUISD::Hi, PtrVT, GA, Zero);
 856       SDValue Lo = DAG.getNode(SPUISD::Lo, PtrVT, GA, Zero);
 857       return DAG.getNode(SPUISD::IndirectAddr, PtrVT, Hi, Lo);
 858     }
 859   } else {
 860     cerr << "LowerGlobalAddress: Relocation model other than static not "
 861          << "supported.\n";
 862     abort();
 863     /*NOTREACHED*/
 864   }
 865
 866   return SDValue();
 867 }
 868
 869 //! Custom lower i64 integer constants
 870 /*!
 871  This code inserts all of the necessary juggling that needs to occur to load
 872  a 64-bit constant into a register.
 873  */
 874 static SDValue
 875 LowerConstant(SDValue Op, SelectionDAG &DAG) {
 876   MVT VT = Op.getValueType();
 877
 878   if (VT == MVT::i64) {
 879     ConstantSDNode *CN = cast<ConstantSDNode>(Op.getNode());
 880     SDValue T = DAG.getConstant(CN->getZExtValue(), VT);
 881     return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
 882                        DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T));
 883   } else {
 884     cerr << "LowerConstant: unhandled constant type "
 885          << VT.getMVTString()
 886          << "\n";
 887     abort();
 888     /*NOTREACHED*/
 889   }
 890
 891   return SDValue();
 892 }
 893
 894 //! Custom lower double precision floating point constants
 895 static SDValue
 896 LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
 897   MVT VT = Op.getValueType();
 898
 899   if (VT == MVT::f64) {
 900     ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
 901
 902     assert((FP != 0) &&
 903            "LowerConstantFP: Node is not ConstantFPSDNode");
 904
 905     uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
 906     SDValue T = DAG.getConstant(dbits, MVT::i64);
 907     SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T);
 908     return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
 909                        DAG.getNode(ISD::BIT_CONVERT, MVT::v2f64, Tvec));
 910   }
 911
 912   return SDValue();
 913 }
 914
 915 static SDValue
 916 LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
 917 {
 918   MachineFunction &MF = DAG.getMachineFunction();
 919   MachineFrameInfo *MFI = MF.getFrameInfo();
 920   MachineRegisterInfo &RegInfo = MF.getRegInfo();
 921   SmallVector<SDValue, 48> ArgValues;
 922   SDValue Root = Op.getOperand(0);
 923   bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
 924
 925   const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
 926   const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
 927
 928   unsigned ArgOffset = SPUFrameInfo::minStackSize();
 929   unsigned ArgRegIdx = 0;
 930   unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
 931
 932   MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 933
 934   // Add DAG nodes to load the arguments or copy them out of registers.
 935   for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues() - 1;
 936        ArgNo != e; ++ArgNo) {
 937     MVT ObjectVT = Op.getValue(ArgNo).getValueType();
 938     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
 939     SDValue ArgVal;
 940
 941     if (ArgRegIdx < NumArgRegs) {
 942       const TargetRegisterClass *ArgRegClass;
 943
 944       switch (ObjectVT.getSimpleVT()) {
 945       default: {
 946         cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
 947              << ObjectVT.getMVTString()
 948              << "\n";
 949         abort();
 950       }
 951       case MVT::i8:
 952         ArgRegClass = &SPU::R8CRegClass;
 953         break;
 954       case MVT::i16:
 955         ArgRegClass = &SPU::R16CRegClass;
 956         break;
 957       case MVT::i32:
 958         ArgRegClass = &SPU::R32CRegClass;
 959         break;
 960       case MVT::i64:
 961         ArgRegClass = &SPU::R64CRegClass;
 962         break;
 963       case MVT::i128:
 964         ArgRegClass = &SPU::GPRCRegClass;
 965         break;
 966       case MVT::f32:
 967         ArgRegClass = &SPU::R32FPRegClass;
 968         break;
 969       case MVT::f64:
 970         ArgRegClass = &SPU::R64FPRegClass;
 971         break;
 972       case MVT::v2f64:
 973       case MVT::v4f32:
 974       case MVT::v2i64:
 975       case MVT::v4i32:
 976       case MVT::v8i16:
 977       case MVT::v16i8:
 978         ArgRegClass = &SPU::VECREGRegClass;
 979         break;
 980       }
 981
 982       unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
 983       RegInfo.addLiveIn(ArgRegs[ArgRegIdx], VReg);
 984       ArgVal = DAG.getCopyFromReg(Root, VReg, ObjectVT);
 985       ++ArgRegIdx;
 986     } else {
 987       // We need to load the argument to a virtual register if we determined
 988       // above that we ran out of physical registers of the appropriate type
 989       // or we're forced to do vararg
 990       int FI = MFI->CreateFixedObject(ObjSize, ArgOffset);
 991       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
 992       ArgVal = DAG.getLoad(ObjectVT, Root, FIN, NULL, 0);
 993       ArgOffset += StackSlotSize;
 994     }
 995
 996     ArgValues.push_back(ArgVal);
 997     // Update the chain
 998     Root = ArgVal.getOperand(0);
 999   }
1000
1001   // vararg handling:
1002   if (isVarArg) {
1003     // unsigned int ptr_size = PtrVT.getSizeInBits() / 8;
1004     // We will spill (79-3)+1 registers to the stack
1005     SmallVector<SDValue, 79-3+1> MemOps;
1006
1007     // Create the frame slot
1008
1009     for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
1010       VarArgsFrameIndex = MFI->CreateFixedObject(StackSlotSize, ArgOffset);
1011       SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
1012       SDValue ArgVal = DAG.getRegister(ArgRegs[ArgRegIdx], MVT::v16i8);
1013       SDValue Store = DAG.getStore(Root, ArgVal, FIN, NULL, 0);
1014       Root = Store.getOperand(0);
1015       MemOps.push_back(Store);
1016
1017       // Increment address by stack slot size for the next stored argument
1018       ArgOffset += StackSlotSize;
1019     }
1020     if (!MemOps.empty())
1021       Root = DAG.getNode(ISD::TokenFactor,MVT::Other,&MemOps[0],MemOps.size());
1022   }
1023
1024   ArgValues.push_back(Root);
1025
1026   // Return the new list of results.
1027   return DAG.getNode(ISD::MERGE_VALUES, Op.getNode()->getVTList(),
1028                      &ArgValues[0], ArgValues.size());
1029 }
1030
1031 /// isLSAAddress - Return the immediate to use if the specified
1032 /// value is representable as a LSA address.
1033 static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
1034   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
1035   if (!C) return 0;
1036
1037   int Addr = C->getZExtValue();
1038   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
1039       (Addr << 14 >> 14) != Addr)
1040     return 0;  // Top 14 bits have to be sext of immediate.
1041
1042   return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
1043 }
1044
1045 static SDValue
1046 LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1047   CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
1048   SDValue Chain = TheCall->getChain();
1049   SDValue Callee    = TheCall->getCallee();
1050   unsigned NumOps     = TheCall->getNumArgs();
1051   unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
1052   const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
1053   const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
1054
1055   // Handy pointer type
1056   MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1057
1058   // Accumulate how many bytes are to be pushed on the stack, including the
1059   // linkage area, and parameter passing area.  According to the SPU ABI,
1060   // we minimally need space for [LR] and [SP]
1061   unsigned NumStackBytes = SPUFrameInfo::minStackSize();
1062
1063   // Set up a copy of the stack pointer for use loading and storing any
1064   // arguments that may not fit in the registers available for argument
1065   // passing.
1066   SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);
1067
1068   // Figure out which arguments are going to go in registers, and which in
1069   // memory.
1070   unsigned ArgOffset = SPUFrameInfo::minStackSize(); // Just below [LR]
1071   unsigned ArgRegIdx = 0;
1072
1073   // Keep track of registers passing arguments
1074   std::vector<std::pair<unsigned, SDValue> > RegsToPass;
1075   // And the arguments passed on the stack
1076   SmallVector<SDValue, 8> MemOpChains;
1077
1078   for (unsigned i = 0; i != NumOps; ++i) {
1079     SDValue Arg = TheCall->getArg(i);
1080
1081     // PtrOff will be used to store the current argument to the stack if a
1082     // register cannot be found for it.
1083     SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
1084     PtrOff = DAG.getNode(ISD::ADD, PtrVT, StackPtr, PtrOff);
1085
1086     switch (Arg.getValueType().getSimpleVT()) {
1087     default: assert(0 && "Unexpected ValueType for argument!");
1088     case MVT::i8:
1089     case MVT::i16:
1090     case MVT::i32:
1091     case MVT::i64:
1092     case MVT::i128:
1093       if (ArgRegIdx != NumArgRegs) {
1094         RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
1095       } else {
1096         MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
1097         ArgOffset += StackSlotSize;
1098       }
1099       break;
1100     case MVT::f32:
1101     case MVT::f64:
1102       if (ArgRegIdx != NumArgRegs) {
1103         RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
1104       } else {
1105         MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
1106         ArgOffset += StackSlotSize;
1107       }
1108       break;
1109     case MVT::v2i64:
1110     case MVT::v2f64:
1111     case MVT::v4f32:
1112     case MVT::v4i32:
1113     case MVT::v8i16:
1114     case MVT::v16i8:
1115       if (ArgRegIdx != NumArgRegs) {
1116         RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
1117       } else {
1118         MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
1119         ArgOffset += StackSlotSize;
1120       }
1121       break;
1122     }
1123   }
1124
1125   // Update number of stack bytes actually used, insert a call sequence start
1126   NumStackBytes = (ArgOffset - SPUFrameInfo::minStackSize());
1127   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
1128                                                             true));
1129
1130   if (!MemOpChains.empty()) {
1131     // Adjust the stack pointer for the stack arguments.
1132     Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
1133                         &MemOpChains[0], MemOpChains.size());
1134   }
1135
1136   // Build a sequence of copy-to-reg nodes chained together with token chain
1137   // and flag operands which copy the outgoing args into the appropriate regs.
1138   SDValue InFlag;
1139   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1140     Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
1141                              InFlag);
1142     InFlag = Chain.getValue(1);
1143   }
1144
1145   SmallVector<SDValue, 8> Ops;
1146   unsigned CallOpc = SPUISD::CALL;
1147
1148   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
1149   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
1150   // node so that legalize doesn't hack it.
1151   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1152     GlobalValue *GV = G->getGlobal();
1153     MVT CalleeVT = Callee.getValueType();
1154     SDValue Zero = DAG.getConstant(0, PtrVT);
1155     SDValue GA = DAG.getTargetGlobalAddress(GV, CalleeVT);
1156
1157     if (!ST->usingLargeMem()) {
1158       // Turn calls to targets that are defined (i.e., have bodies) into BRSL
1159       // style calls, otherwise, external symbols are BRASL calls. This assumes
1160       // that declared/defined symbols are in the same compilation unit and can
1161       // be reached through PC-relative jumps.
1162       //
1163       // NOTE:
1164       // This may be an unsafe assumption for JIT and really large compilation
1165       // units.
1166       if (GV->isDeclaration()) {
1167         Callee = DAG.getNode(SPUISD::AFormAddr, CalleeVT, GA, Zero);
1168       } else {
1169         Callee = DAG.getNode(SPUISD::PCRelAddr, CalleeVT, GA, Zero);
1170       }
1171     } else {
1172       // "Large memory" mode: Turn all calls into indirect calls with a X-form
1173       // address pairs:
1174       Callee = DAG.getNode(SPUISD::IndirectAddr, PtrVT, GA, Zero);
1175     }
1176   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1177     MVT CalleeVT = Callee.getValueType();
1178     SDValue Zero = DAG.getConstant(0, PtrVT);
1179     SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
1180         Callee.getValueType());
1181
1182     if (!ST->usingLargeMem()) {
1183       Callee = DAG.getNode(SPUISD::AFormAddr, CalleeVT, ExtSym, Zero);
1184     } else {
1185       Callee = DAG.getNode(SPUISD::IndirectAddr, PtrVT, ExtSym, Zero);
1186     }
1187   } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
1188     // If this is an absolute destination address that appears to be a legal
1189     // local store address, use the munged value.
1190     Callee = SDValue(Dest, 0);
1191   }
1192
1193   Ops.push_back(Chain);
1194   Ops.push_back(Callee);
1195
1196   // Add argument registers to the end of the list so that they are known live
1197   // into the call.
1198   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1199     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1200                                   RegsToPass[i].second.getValueType()));
1201
1202   if (InFlag.getNode())
1203     Ops.push_back(InFlag);
1204   // Returns a chain and a flag for retval copy to use.
1205   Chain = DAG.getNode(CallOpc, DAG.getVTList(MVT::Other, MVT::Flag),
1206                       &Ops[0], Ops.size());
1207   InFlag = Chain.getValue(1);
1208
1209   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
1210                              DAG.getIntPtrConstant(0, true), InFlag);
1211   if (TheCall->getValueType(0) != MVT::Other)
1212     InFlag = Chain.getValue(1);
1213
1214   SDValue ResultVals[3];
1215   unsigned NumResults = 0;
1216
1217   // If the call has results, copy the values out of the ret val registers.
1218   switch (TheCall->getValueType(0).getSimpleVT()) {
1219   default: assert(0 && "Unexpected ret value!");
1220   case MVT::Other: break;
1221   case MVT::i32:
1222     if (TheCall->getValueType(1) == MVT::i32) {
1223       Chain = DAG.getCopyFromReg(Chain, SPU::R4, MVT::i32, InFlag).getValue(1);
1224       ResultVals[0] = Chain.getValue(0);
1225       Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i32,
1226                                  Chain.getValue(2)).getValue(1);
1227       ResultVals[1] = Chain.getValue(0);
1228       NumResults = 2;
1229     } else {
1230       Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i32, InFlag).getValue(1);
1231       ResultVals[0] = Chain.getValue(0);
1232       NumResults = 1;
1233     }
1234     break;
1235   case MVT::i64:
1236     Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i64, InFlag).getValue(1);
1237     ResultVals[0] = Chain.getValue(0);
1238     NumResults = 1;
1239     break;
1240   case MVT::i128:
1241     Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i128, InFlag).getValue(1);
1242     ResultVals[0] = Chain.getValue(0);
1243     NumResults = 1;
1244     break;
1245   case MVT::f32:
1246   case MVT::f64:
1247     Chain = DAG.getCopyFromReg(Chain, SPU::R3, TheCall->getValueType(0),
1248                                InFlag).getValue(1);
1249     ResultVals[0] = Chain.getValue(0);
1250     NumResults = 1;
1251     break;
1252   case MVT::v2f64:
1253   case MVT::v2i64:
1254   case MVT::v4f32:
1255   case MVT::v4i32:
1256   case MVT::v8i16:
1257   case MVT::v16i8:
1258     Chain = DAG.getCopyFromReg(Chain, SPU::R3, TheCall->getValueType(0),
1259                                    InFlag).getValue(1);
1260     ResultVals[0] = Chain.getValue(0);
1261     NumResults = 1;
1262     break;
1263   }
1264
1265   // If the function returns void, just return the chain.
1266   if (NumResults == 0)
1267     return Chain;
1268
1269   // Otherwise, merge everything together with a MERGE_VALUES node.
1270   ResultVals[NumResults++] = Chain;
1271   SDValue Res = DAG.getMergeValues(ResultVals, NumResults);
1272   return Res.getValue(Op.getResNo());
1273 }
1274
1275 static SDValue
1276 LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM) {
1277   SmallVector<CCValAssign, 16> RVLocs;
1278   unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
1279   bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
1280   CCState CCInfo(CC, isVarArg, TM, RVLocs);
1281   CCInfo.AnalyzeReturn(Op.getNode(), RetCC_SPU);
1282
1283   // If this is the first return lowered for this function, add the regs to the
1284   // liveout set for the function.
1285   if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
1286     for (unsigned i = 0; i != RVLocs.size(); ++i)
1287       DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1288   }
1289
1290   SDValue Chain = Op.getOperand(0);
1291   SDValue Flag;
1292
1293   // Copy the result values into the output registers.
1294   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1295     CCValAssign &VA = RVLocs[i];
1296     assert(VA.isRegLoc() && "Can only return in registers!");
1297     Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), Op.getOperand(i*2+1), Flag);
1298     Flag = Chain.getValue(1);
1299   }
1300
1301   if (Flag.getNode())
1302     return DAG.getNode(SPUISD::RET_FLAG, MVT::Other, Chain, Flag);
1303   else
1304     return DAG.getNode(SPUISD::RET_FLAG, MVT::Other, Chain);
1305 }
1306
1307
1308 //===----------------------------------------------------------------------===//
1309 // Vector related lowering:
1310 //===----------------------------------------------------------------------===//
1311
1312 static ConstantSDNode *
1313 getVecImm(SDNode *N) {
1314   SDValue OpVal(0, 0);
1315
1316   // Check to see if this buildvec has a single non-undef value in its elements.
1317   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
1318     if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
1319     if (OpVal.getNode() == 0)
1320       OpVal = N->getOperand(i);
1321     else if (OpVal != N->getOperand(i))
1322       return 0;
1323   }
1324
1325   if (OpVal.getNode() != 0) {
1326     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
1327       return CN;
1328     }
1329   }
1330
1331   return 0; // All UNDEF: use implicit def.; not Constant node
1332 }
1333
1334 /// get_vec_i18imm - Test if this vector is a vector filled with the same value
1335 /// and the value fits into an unsigned 18-bit constant, and if so, return the
1336 /// constant
1337 SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
1338                               MVT ValueType) {
1339   if (ConstantSDNode *CN = getVecImm(N)) {
1340     uint64_t Value = CN->getZExtValue();
1341     if (ValueType == MVT::i64) {
1342       uint64_t UValue = CN->getZExtValue();
1343       uint32_t upper = uint32_t(UValue >> 32);
1344       uint32_t lower = uint32_t(UValue);
1345       if (upper != lower)
1346         return SDValue();
1347       Value = Value >> 32;
1348     }
1349     if (Value <= 0x3ffff)
1350       return DAG.getTargetConstant(Value, ValueType);
1351   }
1352
1353   return SDValue();
1354 }
1355
1356 /// get_vec_i16imm - Test if this vector is a vector filled with the same value
1357 /// and the value fits into a signed 16-bit constant, and if so, return the
1358 /// constant
1359 SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
1360                               MVT ValueType) {
1361   if (ConstantSDNode *CN = getVecImm(N)) {
1362     int64_t Value = CN->getSExtValue();
1363     if (ValueType == MVT::i64) {
1364       uint64_t UValue = CN->getZExtValue();
1365       uint32_t upper = uint32_t(UValue >> 32);
1366       uint32_t lower = uint32_t(UValue);
1367       if (upper != lower)
1368         return SDValue();
1369       Value = Value >> 32;
1370     }
1371     if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
1372       return DAG.getTargetConstant(Value, ValueType);
1373     }
1374   }
1375
1376   return SDValue();
1377 }
1378
1379 /// get_vec_i10imm - Test if this vector is a vector filled with the same value
1380 /// and the value fits into a signed 10-bit constant, and if so, return the
1381 /// constant
1382 SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
1383                               MVT ValueType) {
1384   if (ConstantSDNode *CN = getVecImm(N)) {
1385     int64_t Value = CN->getSExtValue();
1386     if (ValueType == MVT::i64) {
1387       uint64_t UValue = CN->getZExtValue();
1388       uint32_t upper = uint32_t(UValue >> 32);
1389       uint32_t lower = uint32_t(UValue);
1390       if (upper != lower)
1391         return SDValue();
1392       Value = Value >> 32;
1393     }
1394     if (isS10Constant(Value))
1395       return DAG.getTargetConstant(Value, ValueType);
1396   }
1397
1398   return SDValue();
1399 }
1400
1401 /// get_vec_i8imm - Test if this vector is a vector filled with the same value
1402 /// and the value fits into a signed 8-bit constant, and if so, return the
1403 /// constant.
1404 ///
1405 /// @note: The incoming vector is v16i8 because that's the only way we can load
1406 /// constant vectors. Thus, we test to see if the upper and lower bytes are the
1407 /// same value.
1408 SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
1409                              MVT ValueType) {
1410   if (ConstantSDNode *CN = getVecImm(N)) {
1411     int Value = (int) CN->getZExtValue();
1412     if (ValueType == MVT::i16
1413         && Value <= 0xffff                 /* truncated from uint64_t */
1414         && ((short) Value >> 8) == ((short) Value & 0xff))
1415       return DAG.getTargetConstant(Value & 0xff, ValueType);
1416     else if (ValueType == MVT::i8
1417              && (Value & 0xff) == Value)
1418       return DAG.getTargetConstant(Value, ValueType);
1419   }
1420
1421   return SDValue();
1422 }
1423
1424 /// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
1425 /// and the value fits into a signed 16-bit constant, and if so, return the
1426 /// constant
1427 SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
1428                                MVT ValueType) {
1429   if (ConstantSDNode *CN = getVecImm(N)) {
1430     uint64_t Value = CN->getZExtValue();
1431     if ((ValueType == MVT::i32
1432           && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
1433         || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
1434       return DAG.getTargetConstant(Value >> 16, ValueType);
1435   }
1436
1437   return SDValue();
1438 }
1439
1440 /// get_v4i32_imm - Catch-all for general 32-bit constant vectors
1441 SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
1442   if (ConstantSDNode *CN = getVecImm(N)) {
1443     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
1444   }
1445
1446   return SDValue();
1447 }
1448
1449 /// get_v4i32_imm - Catch-all for general 64-bit constant vectors
1450 SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
1451   if (ConstantSDNode *CN = getVecImm(N)) {
1452     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
1453   }
1454
1455   return SDValue();
1456 }
1457
1458 // If this is a vector of constants or undefs, get the bits.  A bit in
1459 // UndefBits is set if the corresponding element of the vector is an
1460 // ISD::UNDEF value.  For undefs, the corresponding VectorBits values are
1461 // zero.   Return true if this is not an array of constants, false if it is.
1462 //
1463 static bool GetConstantBuildVectorBits(SDNode *BV, uint64_t VectorBits[2],
1464                                        uint64_t UndefBits[2]) {
1465   // Start with zero'd results.
1466   VectorBits[0] = VectorBits[1] = UndefBits[0] = UndefBits[1] = 0;
1467
1468   unsigned EltBitSize = BV->getOperand(0).getValueType().getSizeInBits();
1469   for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
1470     SDValue OpVal = BV->getOperand(i);
1471
1472     unsigned PartNo = i >= e/2;     // In the upper 128 bits?
1473     unsigned SlotNo = e/2 - (i & (e/2-1))-1;  // Which subpiece of the uint64_t.
1474
1475     uint64_t EltBits = 0;
1476     if (OpVal.getOpcode() == ISD::UNDEF) {
1477       uint64_t EltUndefBits = ~0ULL >> (64-EltBitSize);
1478       UndefBits[PartNo] |= EltUndefBits << (SlotNo*EltBitSize);
1479       continue;
1480     } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
1481       EltBits = CN->getZExtValue() & (~0ULL >> (64-EltBitSize));
1482     } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
1483       const APFloat &apf = CN->getValueAPF();
1484       EltBits = (CN->getValueType(0) == MVT::f32
1485                  ? FloatToBits(apf.convertToFloat())
1486                  : DoubleToBits(apf.convertToDouble()));
1487     } else {
1488       // Nonconstant element.
1489       return true;
1490     }
1491
1492     VectorBits[PartNo] |= EltBits << (SlotNo*EltBitSize);
1493   }
1494
1495   //printf("%llx %llx  %llx %llx\n",
1496   //       VectorBits[0], VectorBits[1], UndefBits[0], UndefBits[1]);
1497   return false;
1498 }
1499
/// Test whether a 128-bit constant (given as two 64-bit pieces plus matching
/// undef masks) is a splat, i.e. a repetition of one value across the whole
/// vector, at a granularity no finer than \p MinSplatBits.
///
/// The value is halved step by step (64, 32, 16, 8 bits).  At each step the
/// two halves must agree, ignoring undef bits.  The search stops at the first
/// width that is not larger than \p MinSplatBits... that is:
///   - MinSplatBits >= 64: 8-byte splat
///   - MinSplatBits >= 32: 4-byte splat
///   - MinSplatBits >= 16: 2-byte splat
///   - otherwise:          1-byte splat
/// For example, "0x01010101010101..." with MinSplatBits = 8 yields
/// SplatBits = 0x01 and SplatSize = 1 byte.
///
/// On success returns true and sets SplatBits, SplatUndef and SplatSize (in
/// bytes).  Returns false, leaving the outputs untouched, if any required pair
/// of halves does not match.
static bool isConstantSplat(const uint64_t Bits128[2],
                            const uint64_t Undef128[2],
                            int MinSplatBits,
                            uint64_t &SplatBits, uint64_t &SplatUndef,
                            int &SplatSize) {
  // Don't let undefs prevent splats from matching.  The top 64 bits must be
  // the same as the lower 64 bits, ignoring undefs.
  if ((Bits128[0] & ~Undef128[1]) != (Bits128[1] & ~Undef128[0]))
    return false;  // Can't be a splat if two pieces don't match.

  if (MinSplatBits >= 64) {
    SplatBits = Bits128[0];
    SplatUndef = Undef128[0];
    SplatSize = 8;
    return true;
  }

  // Fold to 64 bits; the top 32 bits must match the lower 32 bits, ignoring
  // undefs.
  const uint64_t Folded64 = Bits128[0] | Bits128[1];
  const uint64_t UndefMask64 = Undef128[0] & Undef128[1];
  if ((Folded64 & (~UndefMask64 >> 32)) != ((Folded64 >> 32) & ~UndefMask64))
    return false;

  const uint32_t Folded32 = uint32_t(Folded64) | uint32_t(Folded64 >> 32);
  const uint32_t UndefMask32 =
    uint32_t(UndefMask64) & uint32_t(UndefMask64 >> 32);
  if (MinSplatBits >= 32) {
    SplatBits = Folded32;
    SplatUndef = UndefMask32;
    SplatSize = 4;
    return true;
  }

  // The top 16 bits must match the lower 16 bits, ignoring undefs.
  if ((Folded32 & (~UndefMask32 >> 16)) != ((Folded32 >> 16) & ~UndefMask32))
    return false;

  const uint16_t Folded16 = uint16_t(Folded32) | uint16_t(Folded32 >> 16);
  const uint16_t UndefMask16 =
    uint16_t(UndefMask32) & uint16_t(UndefMask32 >> 16);
  if (MinSplatBits >= 16) {
    SplatBits = Folded16;
    SplatUndef = UndefMask16;
    SplatSize = 2;
    return true;
  }

  // The top 8 bits must match the lower 8 bits, ignoring undefs.
  if ((Folded16 & (uint16_t(~UndefMask16) >> 8))
      != ((Folded16 >> 8) & ~UndefMask16))
    return false;

  // We have an 8-bit splat.
  SplatBits  = uint8_t(Folded16)  | uint8_t(Folded16 >> 8);
  SplatUndef = uint8_t(UndefMask16) & uint8_t(UndefMask16 >> 8);
  SplatSize = 1;
  return true;
}
1564
1565 //! Lower a BUILD_VECTOR instruction creatively:
1566 SDValue
1567 SPU::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
1568   MVT VT = Op.getValueType();
1569   // If this is a vector of constants or undefs, get the bits.  A bit in
1570   // UndefBits is set if the corresponding element of the vector is an
1571   // ISD::UNDEF value.  For undefs, the corresponding VectorBits values are
1572   // zero.
1573   uint64_t VectorBits[2];
1574   uint64_t UndefBits[2];
1575   uint64_t SplatBits, SplatUndef;
1576   int SplatSize;
1577   if (GetConstantBuildVectorBits(Op.getNode(), VectorBits, UndefBits)
1578       || !isConstantSplat(VectorBits, UndefBits,
1579                           VT.getVectorElementType().getSizeInBits(),
1580                           SplatBits, SplatUndef, SplatSize))
1581     return SDValue();   // Not a constant vector, not a splat.
1582
1583   switch (VT.getSimpleVT()) {
1584   default:
1585     cerr << "CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = "
1586          << VT.getMVTString()
1587          << "\n";
1588     abort();
1589     /*NOTREACHED*/
1590   case MVT::v4f32: {
1591     uint32_t Value32 = SplatBits;
1592     assert(SplatSize == 4
1593            && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
1594     // NOTE: pretend the constant is an integer. LLVM won't load FP constants
1595     SDValue T = DAG.getConstant(Value32, MVT::i32);
1596     return DAG.getNode(ISD::BIT_CONVERT, MVT::v4f32,
1597                        DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, T, T, T, T));
1598     break;
1599   }
1600   case MVT::v2f64: {
1601     uint64_t f64val = SplatBits;
1602     assert(SplatSize == 8
1603            && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
1604     // NOTE: pretend the constant is an integer. LLVM won't load FP constants
1605     SDValue T = DAG.getConstant(f64val, MVT::i64);
1606     return DAG.getNode(ISD::BIT_CONVERT, MVT::v2f64,
1607                        DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T));
1608     break;
1609   }
1610   case MVT::v16i8: {
1611    // 8-bit constants have to be expanded to 16-bits
1612    unsigned short Value16 = SplatBits | (SplatBits << 8);
1613    SDValue Ops[8];
1614    for (int i = 0; i < 8; ++i)
1615      Ops[i] = DAG.getConstant(Value16, MVT::i16);
1616    return DAG.getNode(ISD::BIT_CONVERT, VT,
1617                       DAG.getNode(ISD::BUILD_VECTOR, MVT::v8i16, Ops, 8));
1618   }
1619   case MVT::v8i16: {
1620     unsigned short Value16;
1621     if (SplatSize == 2)
1622       Value16 = (unsigned short) (SplatBits & 0xffff);
1623     else
1624       Value16 = (unsigned short) (SplatBits | (SplatBits << 8));
1625     SDValue T = DAG.getConstant(Value16, VT.getVectorElementType());
1626     SDValue Ops[8];
1627     for (int i = 0; i < 8; ++i) Ops[i] = T;
1628     return DAG.getNode(ISD::BUILD_VECTOR, VT, Ops, 8);
1629   }
1630   case MVT::v4i32: {
1631     unsigned int Value = SplatBits;
1632     SDValue T = DAG.getConstant(Value, VT.getVectorElementType());
1633     return DAG.getNode(ISD::BUILD_VECTOR, VT, T, T, T, T);
1634   }
1635   case MVT::v2i32: {
1636     unsigned int Value = SplatBits;
1637     SDValue T = DAG.getConstant(Value, VT.getVectorElementType());
1638     return DAG.getNode(ISD::BUILD_VECTOR, VT, T, T);
1639   }
1640   case MVT::v2i64: {
1641     uint64_t val = SplatBits;
1642     uint32_t upper = uint32_t(val >> 32);
1643     uint32_t lower = uint32_t(val);
1644
1645     if (upper == lower) {
1646       // Magic constant that can be matched by IL, ILA, et. al.
1647       SDValue Val = DAG.getTargetConstant(val, MVT::i64);
1648       return DAG.getNode(ISD::BUILD_VECTOR, VT, Val, Val);
1649     } else {
1650       SDValue LO32;
1651       SDValue HI32;
1652       SmallVector<SDValue, 16> ShufBytes;
1653       SDValue Result;
1654       bool upper_special, lower_special;
1655
1656       // NOTE: This code creates common-case shuffle masks that can be easily
1657       // detected as common expressions. It is not attempting to create highly
1658       // specialized masks to replace any and all 0's, 0xff's and 0x80's.
1659
1660       // Detect if the upper or lower half is a special shuffle mask pattern:
1661       upper_special = (upper == 0||upper == 0xffffffff||upper == 0x80000000);
1662       lower_special = (lower == 0||lower == 0xffffffff||lower == 0x80000000);
1663
1664       // Create lower vector if not a special pattern
1665       if (!lower_special) {
1666         SDValue LO32C = DAG.getConstant(lower, MVT::i32);
1667         LO32 = DAG.getNode(ISD::BIT_CONVERT, VT,
1668                            DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
1669                                        LO32C, LO32C, LO32C, LO32C));
1670       }
1671
1672       // Create upper vector if not a special pattern
1673       if (!upper_special) {
1674         SDValue HI32C = DAG.getConstant(upper, MVT::i32);
1675         HI32 = DAG.getNode(ISD::BIT_CONVERT, VT,
1676                            DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
1677                                        HI32C, HI32C, HI32C, HI32C));
1678       }
1679
1680       // If either upper or lower are special, then the two input operands are
1681       // the same (basically, one of them is a "don't care")
1682       if (lower_special)
1683         LO32 = HI32;
1684       if (upper_special)
1685         HI32 = LO32;
1686       if (lower_special && upper_special) {
1687         // Unhappy situation... both upper and lower are special, so punt with
1688         // a target constant:
1689         SDValue Zero = DAG.getConstant(0, MVT::i32);
1690         HI32 = LO32 = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Zero, Zero,
1691                                   Zero, Zero);
1692       }
1693
1694       for (int i = 0; i < 4; ++i) {
1695         uint64_t val = 0;
1696         for (int j = 0; j < 4; ++j) {
1697           SDValue V;
1698           bool process_upper, process_lower;
1699           val <<= 8;
1700           process_upper = (upper_special && (i & 1) == 0);
1701           process_lower = (lower_special && (i & 1) == 1);
1702
1703           if (process_upper || process_lower) {
1704             if ((process_upper && upper == 0)
1705                 || (process_lower && lower == 0))
1706               val |= 0x80;
1707             else if ((process_upper && upper == 0xffffffff)
1708                      || (process_lower && lower == 0xffffffff))
1709               val |= 0xc0;
1710             else if ((process_upper && upper == 0x80000000)
1711                      || (process_lower && lower == 0x80000000))
1712               val |= (j == 0 ? 0xe0 : 0x80);
1713           } else
1714             val |= i * 4 + j + ((i & 1) * 16);
1715         }
1716
1717         ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
1718       }
1719
1720       return DAG.getNode(SPUISD::SHUFB, VT, HI32, LO32,
1721                          DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
1722                                      &ShufBytes[0], ShufBytes.size()));
1723     }
1724   }
1725   }
1726
1727   return SDValue();
1728 }
1729
1730 /// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
1731 /// which the Cell can operate. The code inspects V3 to ascertain whether the
1732 /// permutation vector, V3, is monotonically increasing with one "exception"
1733 /// element, e.g., (0, 1, _, 3). If this is the case, then generate a
1734 /// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
1735 /// In either case, the net result is going to eventually invoke SHUFB to
1736 /// permute/shuffle the bytes from V1 and V2.
1737 /// \note
1738 /// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
1739 /// control word for byte/halfword/word insertion. This takes care of a single
1740 /// element move from V2 into V1.
1741 /// \note
1742 /// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
1743 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
1744   SDValue V1 = Op.getOperand(0);
1745   SDValue V2 = Op.getOperand(1);
1746   SDValue PermMask = Op.getOperand(2);
1747
1748   if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
1749
1750   // If we have a single element being moved from V1 to V2, this can be handled
1751   // using the C*[DX] compute mask instructions, but the vector elements have
1752   // to be monotonically increasing with one exception element.
1753   MVT VecVT = V1.getValueType();
1754   MVT EltVT = VecVT.getVectorElementType();
1755   unsigned EltsFromV2 = 0;
1756   unsigned V2Elt = 0;
1757   unsigned V2EltIdx0 = 0;
1758   unsigned CurrElt = 0;
1759   unsigned MaxElts = VecVT.getVectorNumElements();
1760   unsigned PrevElt = 0;
1761   unsigned V0Elt = 0;
1762   bool monotonic = true;
1763   bool rotate = true;
1764
1765   if (EltVT == MVT::i8) {
1766     V2EltIdx0 = 16;
1767   } else if (EltVT == MVT::i16) {
1768     V2EltIdx0 = 8;
1769   } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
1770     V2EltIdx0 = 4;
1771   } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
1772     V2EltIdx0 = 2;
1773   } else
1774     assert(0 && "Unhandled vector type in LowerVECTOR_SHUFFLE");
1775
1776   for (unsigned i = 0; i != PermMask.getNumOperands(); ++i) {
1777     if (PermMask.getOperand(i).getOpcode() != ISD::UNDEF) {
1778       unsigned SrcElt = cast<ConstantSDNode > (PermMask.getOperand(i))->getZExtValue();
1779
1780       if (monotonic) {
1781         if (SrcElt >= V2EltIdx0) {
1782           if (1 >= (++EltsFromV2)) {
1783             V2Elt = (V2EltIdx0 - SrcElt) << 2;
1784           }
1785         } else if (CurrElt != SrcElt) {
1786           monotonic = false;
1787         }
1788
1789         ++CurrElt;
1790       }
1791
1792       if (rotate) {
1793         if (PrevElt > 0 && SrcElt < MaxElts) {
1794           if ((PrevElt == SrcElt - 1)
1795               || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
1796             PrevElt = SrcElt;
1797             if (SrcElt == 0)
1798               V0Elt = i;
1799           } else {
1800             rotate = false;
1801           }
1802         } else if (PrevElt == 0) {
1803           // First time through, need to keep track of previous element
1804           PrevElt = SrcElt;
1805         } else {
1806           // This isn't a rotation, takes elements from vector 2
1807           rotate = false;
1808         }
1809       }
1810     }
1811   }
1812
1813   if (EltsFromV2 == 1 && monotonic) {
1814     // Compute mask and shuffle
1815     MachineFunction &MF = DAG.getMachineFunction();
1816     MachineRegisterInfo &RegInfo = MF.getRegInfo();
1817     unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
1818     MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1819     // Initialize temporary register to 0
1820     SDValue InitTempReg =
1821       DAG.getCopyToReg(DAG.getEntryNode(), VReg, DAG.getConstant(0, PtrVT));
1822     // Copy register's contents as index in SHUFFLE_MASK:
1823     SDValue ShufMaskOp =
1824       DAG.getNode(SPUISD::SHUFFLE_MASK, MVT::v4i32,
1825                   DAG.getTargetConstant(V2Elt, MVT::i32),
1826                   DAG.getCopyFromReg(InitTempReg, VReg, PtrVT));
1827     // Use shuffle mask in SHUFB synthetic instruction:
1828     return DAG.getNode(SPUISD::SHUFB, V1.getValueType(), V2, V1, ShufMaskOp);
1829   } else if (rotate) {
1830     int rotamt = (MaxElts - V0Elt) * EltVT.getSizeInBits()/8;
1831
1832     return DAG.getNode(SPUISD::ROTBYTES_LEFT, V1.getValueType(),
1833                        V1, DAG.getConstant(rotamt, MVT::i16));
1834   } else {
1835    // Convert the SHUFFLE_VECTOR mask's input element units to the
1836    // actual bytes.
1837     unsigned BytesPerElement = EltVT.getSizeInBits()/8;
1838
1839     SmallVector<SDValue, 16> ResultMask;
1840     for (unsigned i = 0, e = PermMask.getNumOperands(); i != e; ++i) {
1841       unsigned SrcElt;
1842       if (PermMask.getOperand(i).getOpcode() == ISD::UNDEF)
1843         SrcElt = 0;
1844       else
1845         SrcElt = cast<ConstantSDNode>(PermMask.getOperand(i))->getZExtValue();
1846
1847       for (unsigned j = 0; j < BytesPerElement; ++j) {
1848         ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
1849                                              MVT::i8));
1850       }
1851     }
1852
1853     SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8,
1854                                     &ResultMask[0], ResultMask.size());
1855     return DAG.getNode(SPUISD::SHUFB, V1.getValueType(), V1, V2, VPermMask);
1856   }
1857 }
1858
1859 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
1860   SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
1861
1862   if (Op0.getNode()->getOpcode() == ISD::Constant) {
1863     // For a constant, build the appropriate constant vector, which will
1864     // eventually simplify to a vector register load.
1865
1866     ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
1867     SmallVector<SDValue, 16> ConstVecValues;
1868     MVT VT;
1869     size_t n_copies;
1870
1871     // Create a constant vector:
1872     switch (Op.getValueType().getSimpleVT()) {
1873     default: assert(0 && "Unexpected constant value type in "
1874                          "LowerSCALAR_TO_VECTOR");
1875     case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
1876     case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
1877     case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
1878     case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
1879     case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
1880     case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
1881     }
1882
1883     SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
1884     for (size_t j = 0; j < n_copies; ++j)
1885       ConstVecValues.push_back(CValue);
1886
1887     return DAG.getNode(ISD::BUILD_VECTOR, Op.getValueType(),
1888                        &ConstVecValues[0], ConstVecValues.size());
1889   } else {
1890     // Otherwise, copy the value from one register to another:
1891     switch (Op0.getValueType().getSimpleVT()) {
1892     default: assert(0 && "Unexpected value type in LowerSCALAR_TO_VECTOR");
1893     case MVT::i8:
1894     case MVT::i16:
1895     case MVT::i32:
1896     case MVT::i64:
1897     case MVT::f32:
1898     case MVT::f64:
1899       return DAG.getNode(SPUISD::PREFSLOT2VEC, Op.getValueType(), Op0, Op0);
1900     }
1901   }
1902
1903   return SDValue();
1904 }
1905
1906 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
1907   MVT VT = Op.getValueType();
1908   SDValue N = Op.getOperand(0);
1909   SDValue Elt = Op.getOperand(1);
1910   SDValue retval;
1911
1912   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
1913     // Constant argument:
1914     int EltNo = (int) C->getZExtValue();
1915
1916     // sanity checks:
1917     if (VT == MVT::i8 && EltNo >= 16)
1918       assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
1919     else if (VT == MVT::i16 && EltNo >= 8)
1920       assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
1921     else if (VT == MVT::i32 && EltNo >= 4)
1922       assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
1923     else if (VT == MVT::i64 && EltNo >= 2)
1924       assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
1925
1926     if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
1927       // i32 and i64: Element 0 is the preferred slot
1928       return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, N);
1929     }
1930
1931     // Need to generate shuffle mask and extract:
1932     int prefslot_begin = -1, prefslot_end = -1;
1933     int elt_byte = EltNo * VT.getSizeInBits() / 8;
1934
1935     switch (VT.getSimpleVT()) {
1936     default:
1937       assert(false && "Invalid value type!");
1938     case MVT::i8: {
1939       prefslot_begin = prefslot_end = 3;
1940       break;
1941     }
1942     case MVT::i16: {
1943       prefslot_begin = 2; prefslot_end = 3;
1944       break;
1945     }
1946     case MVT::i32:
1947     case MVT::f32: {
1948       prefslot_begin = 0; prefslot_end = 3;
1949       break;
1950     }
1951     case MVT::i64:
1952     case MVT::f64: {
1953       prefslot_begin = 0; prefslot_end = 7;
1954       break;
1955     }
1956     }
1957
1958     assert(prefslot_begin != -1 && prefslot_end != -1 &&
1959            "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
1960
1961     unsigned int ShufBytes[16];
1962     for (int i = 0; i < 16; ++i) {
1963       // zero fill uppper part of preferred slot, don't care about the
1964       // other slots:
1965       unsigned int mask_val;
1966       if (i <= prefslot_end) {
1967         mask_val =
1968           ((i < prefslot_begin)
1969            ? 0x80
1970            : elt_byte + (i - prefslot_begin));
1971
1972         ShufBytes[i] = mask_val;
1973       } else
1974         ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
1975     }
1976
1977     SDValue ShufMask[4];
1978     for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
1979       unsigned bidx = i * 4;
1980       unsigned int bits = ((ShufBytes[bidx] << 24) |
1981                            (ShufBytes[bidx+1] << 16) |
1982                            (ShufBytes[bidx+2] << 8) |
1983                            ShufBytes[bidx+3]);
1984       ShufMask[i] = DAG.getConstant(bits, MVT::i32);
1985     }
1986
1987     SDValue ShufMaskVec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
1988                                       &ShufMask[0],
1989                                       sizeof(ShufMask) / sizeof(ShufMask[0]));
1990
1991     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
1992                          DAG.getNode(SPUISD::SHUFB, N.getValueType(),
1993                                      N, N, ShufMaskVec));
1994   } else {
1995     // Variable index: Rotate the requested element into slot 0, then replicate
1996     // slot 0 across the vector
1997     MVT VecVT = N.getValueType();
1998     if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) {
1999       cerr << "LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit vector type!\n";
2000       abort();
2001     }
2002
2003     // Make life easier by making sure the index is zero-extended to i32
2004     if (Elt.getValueType() != MVT::i32)
2005       Elt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Elt);
2006
2007     // Scale the index to a bit/byte shift quantity
2008     APInt scaleFactor =
2009             APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
2010     unsigned scaleShift = scaleFactor.logBase2();
2011     SDValue vecShift;
2012
2013     if (scaleShift > 0) {
2014       // Scale the shift factor:
2015       Elt = DAG.getNode(ISD::SHL, MVT::i32, Elt,
2016                         DAG.getConstant(scaleShift, MVT::i32));
2017     }
2018
2019     vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, VecVT, N, Elt);
2020
2021     // Replicate the bytes starting at byte 0 across the entire vector (for
2022     // consistency with the notion of a unified register set)
2023     SDValue replicate;
2024
2025     switch (VT.getSimpleVT()) {
2026     default:
2027       cerr << "LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector type\n";
2028       abort();
2029       /*NOTREACHED*/
2030     case MVT::i8: {
2031       SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
2032       replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor,
2033                               factor, factor);
2034       break;
2035     }
2036     case MVT::i16: {
2037       SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
2038       replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor,
2039                               factor, factor);
2040       break;
2041     }
2042     case MVT::i32:
2043     case MVT::f32: {
2044       SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
2045       replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor,
2046                               factor, factor);
2047       break;
2048     }
2049     case MVT::i64:
2050     case MVT::f64: {
2051       SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
2052       SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
2053       replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, loFactor, hiFactor,
2054                               loFactor, hiFactor);
2055       break;
2056     }
2057     }
2058
2059     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
2060                          DAG.getNode(SPUISD::SHUFB, VecVT,
2061                                      vecShift, vecShift, replicate));
2062   }
2063
2064   return retval;
2065 }
2066
2067 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
2068   SDValue VecOp = Op.getOperand(0);
2069   SDValue ValOp = Op.getOperand(1);
2070   SDValue IdxOp = Op.getOperand(2);
2071   MVT VT = Op.getValueType();
2072
2073   ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
2074   assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
2075
2076   MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2077   // Use $sp ($1) because it's always 16-byte aligned and it's available:
2078   SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, PtrVT,
2079                                 DAG.getRegister(SPU::R1, PtrVT),
2080                                 DAG.getConstant(CN->getSExtValue(), PtrVT));
2081   SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, VT, Pointer);
2082
2083   SDValue result =
2084     DAG.getNode(SPUISD::SHUFB, VT,
2085                 DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, ValOp),
2086                 VecOp,
2087                 DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, ShufMask));
2088
2089   return result;
2090 }
2091
2092 static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
2093                            const TargetLowering &TLI)
2094 {
2095   SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
2096   MVT ShiftVT = TLI.getShiftAmountTy();
2097
2098   assert(Op.getValueType() == MVT::i8);
2099   switch (Opc) {
2100   default:
2101     assert(0 && "Unhandled i8 math operator");
2102     /*NOTREACHED*/
2103     break;
2104   case ISD::ADD: {
2105     // 8-bit addition: Promote the arguments up to 16-bits and truncate
2106     // the result:
2107     SDValue N1 = Op.getOperand(1);
2108     N0 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0);
2109     N1 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1);
2110     return DAG.getNode(ISD::TRUNCATE, MVT::i8,
2111                        DAG.getNode(Opc, MVT::i16, N0, N1));
2112
2113   }
2114
2115   case ISD::SUB: {
2116     // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
2117     // the result:
2118     SDValue N1 = Op.getOperand(1);
2119     N0 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0);
2120     N1 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1);
2121     return DAG.getNode(ISD::TRUNCATE, MVT::i8,
2122                        DAG.getNode(Opc, MVT::i16, N0, N1));
2123   }
2124   case ISD::ROTR:
2125   case ISD::ROTL: {
2126     SDValue N1 = Op.getOperand(1);
2127     unsigned N1Opc;
2128     N0 = (N0.getOpcode() != ISD::Constant
2129           ? DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, N0)
2130           : DAG.getConstant(cast<ConstantSDNode>(N0)->getZExtValue(),
2131                             MVT::i16));
2132     N1Opc = N1.getValueType().bitsLT(ShiftVT)
2133             ? ISD::ZERO_EXTEND
2134             : ISD::TRUNCATE;
2135     N1 = (N1.getOpcode() != ISD::Constant
2136           ? DAG.getNode(N1Opc, ShiftVT, N1)
2137           : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
2138                             TLI.getShiftAmountTy()));
2139     SDValue ExpandArg =
2140       DAG.getNode(ISD::OR, MVT::i16, N0,
2141                   DAG.getNode(ISD::SHL, MVT::i16,
2142                               N0, DAG.getConstant(8, MVT::i32)));
2143     return DAG.getNode(ISD::TRUNCATE, MVT::i8,
2144                        DAG.getNode(Opc, MVT::i16, ExpandArg, N1));
2145   }
2146   case ISD::SRL:
2147   case ISD::SHL: {
2148     SDValue N1 = Op.getOperand(1);
2149     unsigned N1Opc;
2150     N0 = (N0.getOpcode() != ISD::Constant
2151           ? DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, N0)
2152           : DAG.getConstant(cast<ConstantSDNode>(N0)->getZExtValue(),
2153                             MVT::i32));
2154     N1Opc = N1.getValueType().bitsLT(ShiftVT)
2155             ? ISD::ZERO_EXTEND
2156             : ISD::TRUNCATE;
2157     N1 = (N1.getOpcode() != ISD::Constant
2158           ? DAG.getNode(N1Opc, ShiftVT, N1)
2159           : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(), ShiftVT));
2160     return DAG.getNode(ISD::TRUNCATE, MVT::i8,
2161                        DAG.getNode(Opc, MVT::i16, N0, N1));
2162   }
2163   case ISD::SRA: {
2164     SDValue N1 = Op.getOperand(1);
2165     unsigned N1Opc;
2166     N0 = (N0.getOpcode() != ISD::Constant
2167           ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0)
2168           : DAG.getConstant(cast<ConstantSDNode>(N0)->getSExtValue(),
2169                             MVT::i16));
2170     N1Opc = N1.getValueType().bitsLT(ShiftVT)
2171             ? ISD::SIGN_EXTEND
2172             : ISD::TRUNCATE;
2173     N1 = (N1.getOpcode() != ISD::Constant
2174           ? DAG.getNode(N1Opc, ShiftVT, N1)
2175           : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
2176                             ShiftVT));
2177     return DAG.getNode(ISD::TRUNCATE, MVT::i8,
2178                        DAG.getNode(Opc, MVT::i16, N0, N1));
2179   }
2180   case ISD::MUL: {
2181     SDValue N1 = Op.getOperand(1);
2182     unsigned N1Opc;
2183     N0 = (N0.getOpcode() != ISD::Constant
2184           ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0)
2185           : DAG.getConstant(cast<ConstantSDNode>(N0)->getZExtValue(),
2186                             MVT::i16));
2187     N1Opc = N1.getValueType().bitsLT(MVT::i16) ? ISD::SIGN_EXTEND : ISD::TRUNCATE;
2188     N1 = (N1.getOpcode() != ISD::Constant
2189           ? DAG.getNode(N1Opc, MVT::i16, N1)
2190           : DAG.getConstant(cast<ConstantSDNode>(N1)->getSExtValue(),
2191                             MVT::i16));
2192     return DAG.getNode(ISD::TRUNCATE, MVT::i8,
2193                        DAG.getNode(Opc, MVT::i16, N0, N1));
2194     break;
2195   }
2196   }
2197
2198   return SDValue();
2199 }
2200
2201 //! Generate the carry-generate shuffle mask.
2202 SDValue SPU::getCarryGenerateShufMask(SelectionDAG &DAG) {
2203   SmallVector<SDValue, 16 > ShufBytes;
2204
2205   // Create the shuffle mask for "rotating" the borrow up one register slot
2206   // once the borrow is generated.
2207   ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
2208   ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
2209   ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
2210   ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
2211
2212   return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
2213                      &ShufBytes[0], ShufBytes.size());
2214 }
2215
2216 //! Generate the borrow-generate shuffle mask
2217 SDValue SPU::getBorrowGenerateShufMask(SelectionDAG &DAG) {
2218   SmallVector<SDValue, 16 > ShufBytes;
2219
2220   // Create the shuffle mask for "rotating" the borrow up one register slot
2221   // once the borrow is generated.
2222   ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
2223   ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
2224   ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
2225   ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
2226
2227   return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
2228                      &ShufBytes[0], ShufBytes.size());
2229 }
2230
2231 //! Lower byte immediate operations for v16i8 vectors:
2232 static SDValue
2233 LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
2234   SDValue ConstVec;
2235   SDValue Arg;
2236   MVT VT = Op.getValueType();
2237
2238   ConstVec = Op.getOperand(0);
2239   Arg = Op.getOperand(1);
2240   if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
2241     if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) {
2242       ConstVec = ConstVec.getOperand(0);
2243     } else {
2244       ConstVec = Op.getOperand(1);
2245       Arg = Op.getOperand(0);
2246       if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) {
2247         ConstVec = ConstVec.getOperand(0);
2248       }
2249     }
2250   }
2251
2252   if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
2253     uint64_t VectorBits[2];
2254     uint64_t UndefBits[2];
2255     uint64_t SplatBits, SplatUndef;
2256     int SplatSize;
2257
2258     if (!GetConstantBuildVectorBits(ConstVec.getNode(), VectorBits, UndefBits)
2259         && isConstantSplat(VectorBits, UndefBits,
2260                            VT.getVectorElementType().getSizeInBits(),
2261                            SplatBits, SplatUndef, SplatSize)) {
2262       SDValue tcVec[16];
2263       SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
2264       const size_t tcVecSize = sizeof(tcVec) / sizeof(tcVec[0]);
2265
2266       // Turn the BUILD_VECTOR into a set of target constants:
2267       for (size_t i = 0; i < tcVecSize; ++i)
2268         tcVec[i] = tc;
2269
2270       return DAG.getNode(Op.getNode()->getOpcode(), VT, Arg,
2271                          DAG.getNode(ISD::BUILD_VECTOR, VT, tcVec, tcVecSize));
2272     }
2273   }
2274   // These operations (AND, OR, XOR) are legal, they just couldn't be custom
2275   // lowered.  Return the operation, rather than a null SDValue.
2276   return Op;
2277 }
2278
2279 //! Custom lowering for CTPOP (count population)
2280 /*!
2281   Custom lowering code that counts the number ones in the input
2282   operand. SPU has such an instruction, but it counts the number of
2283   ones per byte, which then have to be accumulated.
2284 */
2285 static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
2286   MVT VT = Op.getValueType();
2287   MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
2288
2289   switch (VT.getSimpleVT()) {
2290   default:
2291     assert(false && "Invalid value type!");
2292   case MVT::i8: {
2293     SDValue N = Op.getOperand(0);
2294     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2295
2296     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, vecVT, N, N);
2297     SDValue CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote);
2298
2299     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i8, CNTB, Elt0);
2300   }
2301
2302   case MVT::i16: {
2303     MachineFunction &MF = DAG.getMachineFunction();
2304     MachineRegisterInfo &RegInfo = MF.getRegInfo();
2305
2306     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);
2307
2308     SDValue N = Op.getOperand(0);
2309     SDValue Elt0 = DAG.getConstant(0, MVT::i16);
2310     SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
2311     SDValue Shift1 = DAG.getConstant(8, MVT::i32);
2312
2313     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, vecVT, N, N);
2314     SDValue CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote);
2315
2316     // CNTB_result becomes the chain to which all of the virtual registers
2317     // CNTB_reg, SUM1_reg become associated:
2318     SDValue CNTB_result =
2319       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, CNTB, Elt0);
2320
2321     SDValue CNTB_rescopy =
2322       DAG.getCopyToReg(CNTB_result, CNTB_reg, CNTB_result);
2323
2324     SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i16);
2325
2326     return DAG.getNode(ISD::AND, MVT::i16,
2327                        DAG.getNode(ISD::ADD, MVT::i16,
2328                                    DAG.getNode(ISD::SRL, MVT::i16,
2329                                                Tmp1, Shift1),
2330                                    Tmp1),
2331                        Mask0);
2332   }
2333
2334   case MVT::i32: {
2335     MachineFunction &MF = DAG.getMachineFunction();
2336     MachineRegisterInfo &RegInfo = MF.getRegInfo();
2337
2338     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2339     unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2340
2341     SDValue N = Op.getOperand(0);
2342     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2343     SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
2344     SDValue Shift1 = DAG.getConstant(16, MVT::i32);
2345     SDValue Shift2 = DAG.getConstant(8, MVT::i32);
2346
2347     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, vecVT, N, N);
2348     SDValue CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote);
2349
2350     // CNTB_result becomes the chain to which all of the virtual registers
2351     // CNTB_reg, SUM1_reg become associated:
2352     SDValue CNTB_result =
2353       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, CNTB, Elt0);
2354
2355     SDValue CNTB_rescopy =
2356       DAG.getCopyToReg(CNTB_result, CNTB_reg, CNTB_result);
2357
2358     SDValue Comp1 =
2359       DAG.getNode(ISD::SRL, MVT::i32,
2360                   DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i32), Shift1);
2361
2362     SDValue Sum1 =
2363       DAG.getNode(ISD::ADD, MVT::i32,
2364                   Comp1, DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i32));
2365
2366     SDValue Sum1_rescopy =
2367       DAG.getCopyToReg(CNTB_result, SUM1_reg, Sum1);
2368
2369     SDValue Comp2 =
2370       DAG.getNode(ISD::SRL, MVT::i32,
2371                   DAG.getCopyFromReg(Sum1_rescopy, SUM1_reg, MVT::i32),
2372                   Shift2);
2373     SDValue Sum2 =
2374       DAG.getNode(ISD::ADD, MVT::i32, Comp2,
2375                   DAG.getCopyFromReg(Sum1_rescopy, SUM1_reg, MVT::i32));
2376
2377     return DAG.getNode(ISD::AND, MVT::i32, Sum2, Mask0);
2378   }
2379
2380   case MVT::i64:
2381     break;
2382   }
2383
2384   return SDValue();
2385 }
2386
2387 //! Lower ISD::FABS
2388 /*!
2389  DAGCombine does the same basic reduction: convert the double to i64 and mask
2390  off the sign bit. Unfortunately, DAGCombine inserts the i64 constant, which
2391  CellSPU has to legalize. Hence, the custom lowering.
2392  */
2393
2394 static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
2395   MVT OpVT = Op.getValueType();
2396   MVT IntVT(MVT::i64);
2397   SDValue Op0 = Op.getOperand(0);
2398
2399   assert(OpVT == MVT::f64 && "LowerFABS: expecting MVT::f64!\n");
2400
2401   SDValue iABS =
2402           DAG.getNode(ISD::AND, IntVT,
2403                       DAG.getNode(ISD::BIT_CONVERT, IntVT, Op0),
2404                       DAG.getConstant(~IntVT.getIntegerVTSignBit(), IntVT));
2405
2406   return DAG.getNode(ISD::BIT_CONVERT, MVT::f64, iABS);
2407 }
2408
2409 //! Lower ISD::SETCC
2410 /*!
2411  This handles MVT::f64 (double floating point) condition lowering
2412  */
2413
2414 static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
2415                           const TargetLowering &TLI) {
2416   SDValue lhs = Op.getOperand(0);
2417   SDValue rhs = Op.getOperand(1);
2418   CondCodeSDNode *CC = dyn_cast<CondCodeSDNode > (Op.getOperand(2));
2419   MVT lhsVT = lhs.getValueType();
2420   SDValue posNaN = DAG.getConstant(0x7ff0000000000001ULL, MVT::i64);
2421
2422   assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
2423   assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
2424
2425   switch (CC->get()) {
2426   case ISD::SETOEQ:
2427   case ISD::SETOGT:
2428   case ISD::SETOGE:
2429   case ISD::SETOLT:
2430   case ISD::SETOLE:
2431   case ISD::SETONE:
2432     cerr << "CellSPU ISel Select: unimplemented f64 condition\n";
2433     abort();
2434     break;
2435   case ISD::SETO: {
2436     SDValue lhsfabs = DAG.getNode(ISD::FABS, MVT::f64, lhs);
2437     SDValue i64lhs =
2438             DAG.getNode(ISD::BIT_CONVERT, MVT::i64, lhsfabs);
2439
2440     return DAG.getSetCC(MVT::i32, i64lhs, posNaN, ISD::SETLT);
2441   }
2442   case ISD::SETUO: {
2443     SDValue lhsfabs = DAG.getNode(ISD::FABS, MVT::f64, lhs);
2444     SDValue i64lhs =
2445             DAG.getNode(ISD::BIT_CONVERT, MVT::i64, lhsfabs);
2446
2447     return DAG.getSetCC(MVT::i32, i64lhs, posNaN, ISD::SETGE);
2448   }
2449   case ISD::SETUEQ:
2450   case ISD::SETUGT:
2451   case ISD::SETUGE:
2452   case ISD::SETULT:
2453   case ISD::SETULE:
2454   case ISD::SETUNE:
2455   default:
2456     cerr << "CellSPU ISel Select: unimplemented f64 condition\n";
2457     abort();
2458     break;
2459   }
2460
2461   return SDValue();
2462 }
2463
2464 //! Lower ISD::SELECT_CC
2465 /*!
2466   ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
2467   SELB instruction.
2468
2469   \note Need to revisit this in the future: if the code path through the true
2470   and false value computations is longer than the latency of a branch (6
2471   cycles), then it would be more advantageous to branch and insert a new basic
2472   block and branch on the condition. However, this code does not make that
2473   assumption, given the simplisitc uses so far.
2474  */
2475
2476 static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
2477                               const TargetLowering &TLI) {
2478   MVT VT = Op.getValueType();
2479   SDValue lhs = Op.getOperand(0);
2480   SDValue rhs = Op.getOperand(1);
2481   SDValue trueval = Op.getOperand(2);
2482   SDValue falseval = Op.getOperand(3);
2483   SDValue condition = Op.getOperand(4);
2484
2485   // NOTE: SELB's arguments: $rA, $rB, $mask
2486   //
2487   // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
2488   // where bits in $mask are 1. CCond will be inverted, having 1s where the
2489   // condition was true and 0s where the condition was false. Hence, the
2490   // arguments to SELB get reversed.
2491
2492   // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
2493   // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
2494   // with another "cannot select select_cc" assert:
2495
2496   SDValue compare = DAG.getNode(ISD::SETCC,
2497                                 TLI.getSetCCResultType(Op.getValueType()),
2498                                 lhs, rhs, condition);
2499   return DAG.getNode(SPUISD::SELB, VT, falseval, trueval, compare);
2500 }
2501
2502 //! Custom lower ISD::TRUNCATE
2503 static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
2504 {
2505   MVT VT = Op.getValueType();
2506   MVT::SimpleValueType simpleVT = VT.getSimpleVT();
2507   MVT VecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
2508
2509   SDValue Op0 = Op.getOperand(0);
2510   MVT Op0VT = Op0.getValueType();
2511   MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits()));
2512
2513   if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) {
2514     // Create shuffle mask, least significant doubleword of quadword
2515     unsigned maskHigh = 0x08090a0b;
2516     unsigned maskLow = 0x0c0d0e0f;
2517     // Use a shuffle to perform the truncation
2518     SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
2519                                    DAG.getConstant(maskHigh, MVT::i32),
2520                                    DAG.getConstant(maskLow, MVT::i32),
2521                                    DAG.getConstant(maskHigh, MVT::i32),
2522                                    DAG.getConstant(maskLow, MVT::i32));
2523
2524
2525     SDValue PromoteScalar = DAG.getNode(SPUISD::PREFSLOT2VEC, Op0VecVT, Op0);
2526
2527     SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, Op0VecVT,
2528                                        PromoteScalar, PromoteScalar, shufMask);
2529
2530     return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
2531                        DAG.getNode(ISD::BIT_CONVERT, VecVT, truncShuffle));
2532   }
2533
2534   return SDValue();             // Leave the truncate unmolested
2535 }
2536
2537 //! Custom (target-specific) lowering entry point
2538 /*!
2539   This is where LLVM's DAG selection process calls to do target-specific
2540   lowering of nodes.
2541  */
2542 SDValue
2543 SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
2544 {
2545   unsigned Opc = (unsigned) Op.getOpcode();
2546   MVT VT = Op.getValueType();
2547
2548   switch (Opc) {
2549   default: {
2550     cerr << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
2551     cerr << "Op.getOpcode() = " << Opc << "\n";
2552     cerr << "*Op.getNode():\n";
2553     Op.getNode()->dump();
2554     abort();
2555   }
2556   case ISD::LOAD:
2557   case ISD::EXTLOAD:
2558   case ISD::SEXTLOAD:
2559   case ISD::ZEXTLOAD:
2560     return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
2561   case ISD::STORE:
2562     return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
2563   case ISD::ConstantPool:
2564     return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
2565   case ISD::GlobalAddress:
2566     return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
2567   case ISD::JumpTable:
2568     return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
2569   case ISD::Constant:
2570     return LowerConstant(Op, DAG);
2571   case ISD::ConstantFP:
2572     return LowerConstantFP(Op, DAG);
2573   case ISD::FORMAL_ARGUMENTS:
2574     return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex);
2575   case ISD::CALL:
2576     return LowerCALL(Op, DAG, SPUTM.getSubtargetImpl());
2577   case ISD::RET:
2578     return LowerRET(Op, DAG, getTargetMachine());
2579
2580   // i8, i64 math ops:
2581   case ISD::ADD:
2582   case ISD::SUB:
2583   case ISD::ROTR:
2584   case ISD::ROTL:
2585   case ISD::SRL:
2586   case ISD::SHL:
2587   case ISD::SRA: {
2588     if (VT == MVT::i8)
2589       return LowerI8Math(Op, DAG, Opc, *this);
2590     break;
2591   }
2592
2593   case ISD::FABS:
2594     return LowerFABS(Op, DAG);
2595
2596   // Vector-related lowering.
2597   case ISD::BUILD_VECTOR:
2598     return SPU::LowerBUILD_VECTOR(Op, DAG);
2599   case ISD::SCALAR_TO_VECTOR:
2600     return LowerSCALAR_TO_VECTOR(Op, DAG);
2601   case ISD::VECTOR_SHUFFLE:
2602     return LowerVECTOR_SHUFFLE(Op, DAG);
2603   case ISD::EXTRACT_VECTOR_ELT:
2604     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2605   case ISD::INSERT_VECTOR_ELT:
2606     return LowerINSERT_VECTOR_ELT(Op, DAG);
2607
2608   // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
2609   case ISD::AND:
2610   case ISD::OR:
2611   case ISD::XOR:
2612     return LowerByteImmed(Op, DAG);
2613
2614   // Vector and i8 multiply:
2615   case ISD::MUL:
2616     if (VT == MVT::i8)
2617       return LowerI8Math(Op, DAG, Opc, *this);
2618
2619   case ISD::CTPOP:
2620     return LowerCTPOP(Op, DAG);
2621
2622   case ISD::SELECT_CC:
2623     return LowerSELECT_CC(Op, DAG, *this);
2624
2625   case ISD::SETCC:
2626     return LowerSETCC(Op, DAG, *this);
2627
2628   case ISD::TRUNCATE:
2629     return LowerTRUNCATE(Op, DAG);
2630   }
2631
2632   return SDValue();
2633 }
2634
//! Node result replacement hook (currently does nothing)
/*!
  The only code in the body is compiled out with "#if 0", so as written this
  function returns without reading \a N or \a DAG and without adding anything
  to \a Results.

  The disabled code is a diagnostic stub: it would dump the node and abort for
  every opcode.
 */
void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG)
{
#if 0
  unsigned Opc = (unsigned) N->getOpcode();
  MVT OpVT = N->getValueType(0);

  switch (Opc) {
  default: {
    cerr << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
    cerr << "Op.getOpcode() = " << Opc << "\n";
    cerr << "*Op.getNode():\n";
    N->dump();
    abort();
    /*NOTREACHED*/
  }
  }
#endif

  /* Otherwise, return unchanged */
}
2657
2658 //===----------------------------------------------------------------------===//
2659 // Target Optimization Hooks
2660 //===----------------------------------------------------------------------===//
2661
2662 SDValue
2663 SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
2664 {
2665 #if 0
2666   TargetMachine &TM = getTargetMachine();
2667 #endif
2668   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
2669   SelectionDAG &DAG = DCI.DAG;
2670   SDValue Op0 = N->getOperand(0);       // everything has at least one operand
2671   MVT NodeVT = N->getValueType(0);      // The node's value type
2672   MVT Op0VT = Op0.getValueType();       // The first operand's result
2673   SDValue Result;                       // Initially, empty result
2674
2675   switch (N->getOpcode()) {
2676   default: break;
2677   case ISD::ADD: {
2678     SDValue Op1 = N->getOperand(1);
2679
2680     if (Op0.getOpcode() == SPUISD::IndirectAddr
2681         || Op1.getOpcode() == SPUISD::IndirectAddr) {
2682       // Normalize the operands to reduce repeated code
2683       SDValue IndirectArg = Op0, AddArg = Op1;
2684
2685       if (Op1.getOpcode() == SPUISD::IndirectAddr) {
2686         IndirectArg = Op1;
2687         AddArg = Op0;
2688       }
2689
2690       if (isa<ConstantSDNode>(AddArg)) {
2691         ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
2692         SDValue IndOp1 = IndirectArg.getOperand(1);
2693
2694         if (CN0->isNullValue()) {
2695           // (add (SPUindirect <arg>, <arg>), 0) ->
2696           // (SPUindirect <arg>, <arg>)
2697
2698 #if !defined(NDEBUG)
2699           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2700             cerr << "\n"
2701                  << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
2702                  << "With:    (SPUindirect <arg>, <arg>)\n";
2703           }
2704 #endif
2705
2706           return IndirectArg;
2707         } else if (isa<ConstantSDNode>(IndOp1)) {
2708           // (add (SPUindirect <arg>, <const>), <const>) ->
2709           // (SPUindirect <arg>, <const + const>)
2710           ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
2711           int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
2712           SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);
2713
2714 #if !defined(NDEBUG)
2715           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2716             cerr << "\n"
2717                  << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
2718                  << "), " << CN0->getSExtValue() << ")\n"
2719                  << "With:    (SPUindirect <arg>, "
2720                  << combinedConst << ")\n";
2721           }
2722 #endif
2723
2724           return DAG.getNode(SPUISD::IndirectAddr, Op0VT,
2725                              IndirectArg, combinedValue);
2726         }
2727       }
2728     }
2729     break;
2730   }
2731   case ISD::SIGN_EXTEND:
2732   case ISD::ZERO_EXTEND:
2733   case ISD::ANY_EXTEND: {
2734     if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
2735       // (any_extend (SPUextract_elt0 <arg>)) ->
2736       // (SPUextract_elt0 <arg>)
2737       // Types must match, however...
2738 #if !defined(NDEBUG)
2739       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2740         cerr << "\nReplace: ";
2741         N->dump(&DAG);
2742         cerr << "\nWith:    ";
2743         Op0.getNode()->dump(&DAG);
2744         cerr << "\n";
2745       }
2746 #endif
2747
2748       return Op0;
2749     }
2750     break;
2751   }
2752   case SPUISD::IndirectAddr: {
2753     if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
2754       ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
2755       if (CN != 0 && CN->getZExtValue() == 0) {
2756         // (SPUindirect (SPUaform <addr>, 0), 0) ->
2757         // (SPUaform <addr>, 0)
2758
2759         DEBUG(cerr << "Replace: ");
2760         DEBUG(N->dump(&DAG));
2761         DEBUG(cerr << "\nWith:    ");
2762         DEBUG(Op0.getNode()->dump(&DAG));
2763         DEBUG(cerr << "\n");
2764
2765         return Op0;
2766       }
2767     } else if (Op0.getOpcode() == ISD::ADD) {
2768       SDValue Op1 = N->getOperand(1);
2769       if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
2770         // (SPUindirect (add <arg>, <arg>), 0) ->
2771         // (SPUindirect <arg>, <arg>)
2772         if (CN1->isNullValue()) {
2773
2774 #if !defined(NDEBUG)
2775           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2776             cerr << "\n"
2777                  << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
2778                  << "With:    (SPUindirect <arg>, <arg>)\n";
2779           }
2780 #endif
2781
2782           return DAG.getNode(SPUISD::IndirectAddr, Op0VT,
2783                              Op0.getOperand(0), Op0.getOperand(1));
2784         }
2785       }
2786     }
2787     break;
2788   }
2789   case SPUISD::SHLQUAD_L_BITS:
2790   case SPUISD::SHLQUAD_L_BYTES:
2791   case SPUISD::VEC_SHL:
2792   case SPUISD::VEC_SRL:
2793   case SPUISD::VEC_SRA:
2794   case SPUISD::ROTBYTES_LEFT: {
2795     SDValue Op1 = N->getOperand(1);
2796
2797     // Kill degenerate vector shifts:
2798     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
2799       if (CN->isNullValue()) {
2800         Result = Op0;
2801       }
2802     }
2803     break;
2804   }
2805   case SPUISD::PREFSLOT2VEC: {
2806     switch (Op0.getOpcode()) {
2807     default:
2808       break;
2809     case ISD::ANY_EXTEND:
2810     case ISD::ZERO_EXTEND:
2811     case ISD::SIGN_EXTEND: {
2812       // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
2813       // <arg>
2814       // but only if the SPUprefslot2vec and <arg> types match.
2815       SDValue Op00 = Op0.getOperand(0);
2816       if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
2817         SDValue Op000 = Op00.getOperand(0);
2818         if (Op000.getValueType() == NodeVT) {
2819           Result = Op000;
2820         }
2821       }
2822       break;
2823     }
2824     case SPUISD::VEC2PREFSLOT: {
2825       // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
2826       // <arg>
2827       Result = Op0.getOperand(0);
2828       break;
2829     }
2830     }
2831     break;
2832   }
2833   }
2834
2835   // Otherwise, return unchanged.
2836 #ifndef NDEBUG
2837   if (Result.getNode()) {
2838     DEBUG(cerr << "\nReplace.SPU: ");
2839     DEBUG(N->dump(&DAG));
2840     DEBUG(cerr << "\nWith:        ");
2841     DEBUG(Result.getNode()->dump(&DAG));
2842     DEBUG(cerr << "\n");
2843   }
2844 #endif
2845
2846   return Result;
2847 }
2848
2849 //===----------------------------------------------------------------------===//
2850 // Inline Assembly Support
2851 //===----------------------------------------------------------------------===//
2852
2853 /// getConstraintType - Given a constraint letter, return the type of
2854 /// constraint it is for this target.
2855 SPUTargetLowering::ConstraintType
2856 SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
2857   if (ConstraintLetter.size() == 1) {
2858     switch (ConstraintLetter[0]) {
2859     default: break;
2860     case 'b':
2861     case 'r':
2862     case 'f':
2863     case 'v':
2864     case 'y':
2865       return C_RegisterClass;
2866     }
2867   }
2868   return TargetLowering::getConstraintType(ConstraintLetter);
2869 }
2870
2871 std::pair<unsigned, const TargetRegisterClass*>
2872 SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
2873                                                 MVT VT) const
2874 {
2875   if (Constraint.size() == 1) {
2876     // GCC RS6000 Constraint Letters
2877     switch (Constraint[0]) {
2878     case 'b':   // R1-R31
2879     case 'r':   // R0-R31
2880       if (VT == MVT::i64)
2881         return std::make_pair(0U, SPU::R64CRegisterClass);
2882       return std::make_pair(0U, SPU::R32CRegisterClass);
2883     case 'f':
2884       if (VT == MVT::f32)
2885         return std::make_pair(0U, SPU::R32FPRegisterClass);
2886       else if (VT == MVT::f64)
2887         return std::make_pair(0U, SPU::R64FPRegisterClass);
2888       break;
2889     case 'v':
2890       return std::make_pair(0U, SPU::GPRCRegisterClass);
2891     }
2892   }
2893
2894   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
2895 }
2896
//! Compute used/known bits for a SPU operand
/*!
  The whole body is compiled out with "#if 0", so as written this function
  returns immediately and does not modify \a KnownZero or \a KnownOne.

  The disabled code is an unfinished switch over SPU opcodes; its case labels
  have no statements attached.
 */
void
SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                  const APInt &Mask,
                                                  APInt &KnownZero,
                                                  APInt &KnownOne,
                                                  const SelectionDAG &DAG,
                                                  unsigned Depth ) const {
#if 0
  const uint64_t uint64_sizebits = sizeof(uint64_t) * 8;

  switch (Op.getOpcode()) {
  default:
    // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
    break;
  case CALL:
  case SHUFB:
  case SHUFFLE_MASK:
  case CNTB:
  case SPUISD::PREFSLOT2VEC:
  case SPUISD::LDRESULT:
  case SPUISD::VEC2PREFSLOT:
  case SPUISD::SHLQUAD_L_BITS:
  case SPUISD::SHLQUAD_L_BYTES:
  case SPUISD::VEC_SHL:
  case SPUISD::VEC_SRL:
  case SPUISD::VEC_SRA:
  case SPUISD::VEC_ROTL:
  case SPUISD::VEC_ROTR:
  case SPUISD::ROTBYTES_LEFT:
  case SPUISD::SELECT_MASK:
  case SPUISD::SELB:
  }
#endif
}
2932
2933 unsigned
2934 SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
2935                                                    unsigned Depth) const {
2936   switch (Op.getOpcode()) {
2937   default:
2938     return 1;
2939
2940   case ISD::SETCC: {
2941     MVT VT = Op.getValueType();
2942
2943     if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
2944       VT = MVT::i32;
2945     }
2946     return VT.getSizeInBits();
2947   }
2948   }
2949 }
2950
// LowerAsmOperandForConstraint - Lower an inline asm operand for the given
// constraint letter. There is no SPU-specific handling yet: all arguments are
// forwarded unchanged to TargetLowering::LowerAsmOperandForConstraint.
void
SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                char ConstraintLetter,
                                                bool hasMemory,
                                                std::vector<SDValue> &Ops,
                                                SelectionDAG &DAG) const {
  // Default, for the time being, to the base class handler
  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, hasMemory,
                                               Ops, DAG);
}
2962
2963 /// isLegalAddressImmediate - Return true if the integer value can be used
2964 /// as the offset of the target addressing mode.
2965 bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
2966                                                 const Type *Ty) const {
2967   // SPU's addresses are 256K:
2968   return (V > -(1 << 18) && V < (1 << 18) - 1);
2969 }
2970
/// isLegalAddressImmediate - Global value overload: always returns false,
/// whatever \p GV is.
bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
  return false;
}
2974
/// isOffsetFoldingLegal - Always returns false, whatever \p GA is, because
/// the SPU target does not handle offsets on global addresses yet.
bool
SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The SPU target isn't yet aware of offsets.
  return false;
}