* Rewrote casts

author Misha Brukman <brukman+llvm@gmail.com>

Mon, 26 Jul 2004 18:13:24 +0000 (18:13 +0000)

committer Misha Brukman <brukman+llvm@gmail.com>

Mon, 26 Jul 2004 18:13:24 +0000 (18:13 +0000)
author Misha Brukman <brukman+llvm@gmail.com>
Mon, 26 Jul 2004 18:13:24 +0000 (18:13 +0000)
committer Misha Brukman <brukman+llvm@gmail.com>
Mon, 26 Jul 2004 18:13:24 +0000 (18:13 +0000)
diff --git a/lib/Target/PowerPC/PPC32ISelSimple.cpp b/lib/Target/PowerPC/PPC32ISelSimple.cpp

index c838bdaa2f5fa34f17bb9dbdb4bc7db6574c06cc..ae7cf3ed40c6b803e4aaeed725fa8042d7fea1e9 100644 (file)
--- a/lib/Target/PowerPC/PPC32ISelSimple.cpp
+++ b/lib/Target/PowerPC/PPC32ISelSimple.cpp
@@ -32,8 +32,7 @@
  using namespace llvm;
  
  namespace {
-  Statistic<> GEPConsts("ppc-codegen", "Number of const GEPs");
-  Statistic<> GEPSplits("ppc-codegen", "Number of partially const GEPs");
+  Statistic<> GEPFolds("ppc-codegen", "Number of GEPs folded");
  
    /// TypeClass - Used by the PowerPC backend to group LLVM types by their basic
    /// PPC Representation.
@@ -41,17 +40,6 @@ namespace {
    enum TypeClass {
      cByte, cShort, cInt, cFP32, cFP64, cLong
    };
-
-  // This struct is for recording the necessary operations to emit the GEP
-  typedef struct CollapsedGepOp {
-  public:
-    CollapsedGepOp(bool mul, Value *i, ConstantSInt *s) : 
-      isMul(mul), index(i), size(s) {}
-  
-    bool isMul;
-    Value *index;
-    ConstantSInt *size;
-  } CollapsedGepOp;
  }
  
  /// getClass - Turn a primitive type into a "class" number which is based on the
@@ -90,7 +78,7 @@ namespace {
      MachineFunction *F;                 // The function we are compiling into
      MachineBasicBlock *BB;              // The current MBB we are compiling
      int VarArgsFrameIndex;              // FrameIndex for start of varargs area
-
+    
      std::map<Value*, unsigned> RegMap;  // Mapping between Values and SSA Regs
  
      // External functions used in the Module
@@ -105,6 +93,11 @@ namespace {
      // FrameIndex for the alloca.
      std::map<AllocaInst*, unsigned> AllocaMap;
  
+    // A Reg to hold the base address used for global loads and stores, and a
+    // flag to set whether or not we need to emit it for this function.
+    unsigned GlobalBaseReg;
+    bool GlobalBaseInitialized;
+    
      ISel(TargetMachine &tm) : TM(reinterpret_cast<PowerPCTargetMachine&>(tm)), 
        F(0), BB(0) {}
  
@@ -158,6 +151,9 @@ namespace {
  
        BB = &F->front();
  
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseInitialized = false;
+
        // Copy incoming arguments off of the stack...
        LoadArgumentsToVirtualRegs(Fn);
  
@@ -219,6 +215,16 @@ namespace {
        ValueRecord(unsigned R, const Type *T) : Val(0), Reg(R), Ty(T) {}
        ValueRecord(Value *V) : Val(V), Reg(0), Ty(V->getType()) {}
      };
+    
+    // This struct is for recording the necessary operations to emit the GEP
+    struct CollapsedGepOp {
+      bool isMul;
+      Value *index;
+      ConstantSInt *size;
+      CollapsedGepOp(bool mul, Value *i, ConstantSInt *s) :
+        isMul(mul), index(i), size(s) {}
+    };
+
      void doCall(const ValueRecord &Ret, MachineInstr *CallMI,
                  const std::vector<ValueRecord> &Args, bool isVarArg);
      void visitCallInst(CallInst &I);
@@ -276,7 +282,8 @@ namespace {
      ///
      void emitGEPOperation(MachineBasicBlock *BB, MachineBasicBlock::iterator IP,
                            Value *Src, User::op_iterator IdxBegin,
-                          User::op_iterator IdxEnd, unsigned TargetReg);
+                          User::op_iterator IdxEnd, unsigned TargetReg,
+                          bool CollapseRemainder, ConstantSInt **Remainder);
  
      /// emitCastOperation - Common code shared between visitCastInst and
      /// constant expression cast support.
@@ -335,11 +342,19 @@ namespace {
        
      /// emitSelectOperation - Common code shared between visitSelectInst and the
      /// constant expression support.
+    ///
      void emitSelectOperation(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator IP,
                               Value *Cond, Value *TrueVal, Value *FalseVal,
                               unsigned DestReg);
  
+    /// copyGlobalBaseToRegister - Output the instructions required to put the
+    /// base address to use for accessing globals into a register.
+    ///
+    void ISel::copyGlobalBaseToRegister(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator IP,
+                                        unsigned R);
+
      /// copyConstantToRegister - Output the instructions required to put the
      /// specified constant into the specified register.
      ///
@@ -417,10 +432,6 @@ unsigned ISel::getReg(Value *V, MachineBasicBlock *MBB,
      unsigned Reg = makeAnotherReg(V->getType());
      copyConstantToRegister(MBB, IPt, C, Reg);
      return Reg;
-  } else if (CastInst *CI = dyn_cast<CastInst>(V)) {
-    // Do not emit noop casts at all.
-    if (getClassB(CI->getType()) == getClassB(CI->getOperand(0)->getType()))
-      return getReg(CI->getOperand(0), MBB, IPt);
    } else if (AllocaInst *AI = dyn_castFixedAlloca(V)) {
      unsigned Reg = makeAnotherReg(V->getType());
      unsigned FI = getFixedSizedAllocaFI(AI);
@@ -500,6 +511,26 @@ unsigned ISel::getFixedSizedAllocaFI(AllocaInst *AI) {
  }
  
  
+/// copyGlobalBaseToRegister - Output the instructions required to put the
+/// base address to use for accessing globals into a register.
+///
+void ISel::copyGlobalBaseToRegister(MachineBasicBlock *MBB,
+                                    MachineBasicBlock::iterator IP,
+                                    unsigned R) {
+  if (!GlobalBaseInitialized) {
+    // Insert the set of GlobalBaseReg into the first MBB of the function
+    MachineBasicBlock &FirstMBB = F->front();
+    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+    GlobalBaseReg = makeAnotherReg(Type::IntTy);
+    BuildMI(FirstMBB, MBBI, PPC32::MovePCtoLR, 0, GlobalBaseReg);
+    GlobalBaseInitialized = true;
+  }
+  // Emit our copy of GlobalBaseReg to the destination register in the
+  // current MBB
+  BuildMI(*MBB, IP, PPC32::OR, 2, R).addReg(GlobalBaseReg)
+    .addReg(GlobalBaseReg);
+}
+
  /// copyConstantToRegister - Output the instructions required to put the
  /// specified constant into the specified register.
  ///
@@ -567,14 +598,13 @@ void ISel::copyConstantToRegister(MachineBasicBlock *MBB,
  
      assert(Ty == Type::FloatTy || Ty == Type::DoubleTy && "Unknown FP type!");
  
-    // Load addr of constant to reg; constant is located at PC + distance
-    unsigned CurPC = makeAnotherReg(Type::IntTy);
+    // Load addr of constant to reg; constant is located at base + distance
+    unsigned GlobalBase = makeAnotherReg(Type::IntTy);
      unsigned Reg1 = makeAnotherReg(Type::IntTy);
      unsigned Reg2 = makeAnotherReg(Type::IntTy);
-    // Move PC to destination reg
-    BuildMI(*MBB, IP, PPC32::MovePCtoLR, 0, CurPC);
-    // Move value at PC + distance into return reg
-    BuildMI(*MBB, IP, PPC32::LOADHiAddr, 2, Reg1).addReg(CurPC)
+    // Move value at base + distance into return reg
+    copyGlobalBaseToRegister(MBB, IP, GlobalBase);
+    BuildMI(*MBB, IP, PPC32::LOADHiAddr, 2, Reg1).addReg(GlobalBase)
        .addConstantPoolIndex(CPI);
      BuildMI(*MBB, IP, PPC32::LOADLoDirect, 2, Reg2).addReg(Reg1)
        .addConstantPoolIndex(CPI);
@@ -585,16 +615,15 @@ void ISel::copyConstantToRegister(MachineBasicBlock *MBB,
      // Copy zero (null pointer) to the register.
      BuildMI(*MBB, IP, PPC32::LI, 1, R).addSImm(0);
    } else if (GlobalValue *GV = dyn_cast<GlobalValue>(C)) {
-    // GV is located at PC + distance
-    unsigned CurPC = makeAnotherReg(Type::IntTy);
+    // GV is located at base + distance
+    unsigned GlobalBase = makeAnotherReg(Type::IntTy);
      unsigned TmpReg = makeAnotherReg(GV->getType());
      unsigned Opcode = (GV->hasWeakLinkage() || GV->isExternal()) ? 
        PPC32::LOADLoIndirect : PPC32::LOADLoDirect;
-      
-    // Move PC to destination reg
-    BuildMI(*MBB, IP, PPC32::MovePCtoLR, 0, CurPC);
-    // Move value at PC + distance into return reg
-    BuildMI(*MBB, IP, PPC32::LOADHiAddr, 2, TmpReg).addReg(CurPC)
+    
+    // Move value at base + distance into return reg
+    copyGlobalBaseToRegister(MBB, IP, GlobalBase);
+    BuildMI(*MBB, IP, PPC32::LOADHiAddr, 2, TmpReg).addReg(GlobalBase)
        .addGlobalAddress(GV);
      BuildMI(*MBB, IP, Opcode, 2, R).addReg(TmpReg).addGlobalAddress(GV);
    
@@ -740,7 +769,7 @@ void ISel::LoadArgumentsToVirtualRegs(Function &Fn) {
    // the start of the first vararg value... this is used to expand
    // llvm.va_start.
    if (Fn.getFunctionType()->isVarArg())
-    VarArgsFrameIndex = MFI->CreateFixedObject(1, ArgOffset);
+    VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset);
  }
  
  
@@ -861,6 +890,32 @@ static SetCondInst *canFoldSetCCIntoBranchOrSelect(Value *V) {
    return 0;
  }
  
+
+// canFoldGEPIntoLoadOrStore - Return the GEP instruction if we can fold it into
+// the load or store instruction that is the only user of the GEP.
+//
+static GetElementPtrInst *canFoldGEPIntoLoadOrStore(Value *V) {
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(V))
+    if (GEPI->hasOneUse()) {
+      Instruction *User = cast<Instruction>(GEPI->use_back());
+      if (isa<StoreInst>(User) &&
+          GEPI->getParent() == User->getParent() &&
+          User->getOperand(0) != GEPI &&
+          User->getOperand(1) == GEPI) {
+        ++GEPFolds;
+        return GEPI;
+      }
+      if (isa<LoadInst>(User) &&
+          GEPI->getParent() == User->getParent() &&
+          User->getOperand(0) == GEPI) {
+        ++GEPFolds;
+        return GEPI;
+      }
+    }
+  return 0;
+}
+
+
  // Return a fixed numbering for setcc instructions which does not depend on the
  // order of the opcodes.
  //
@@ -917,6 +972,26 @@ unsigned ISel::EmitComparison(unsigned OpNum, Value *Op0, Value *Op1,
    unsigned Class = getClassB(CompTy);
    unsigned Op0r = getReg(Op0, MBB, IP);
  
+  // Before we do a comparison, we have to make sure that we're truncating our
+  // registers appropriately.
+  if (Class == cByte) {
+    unsigned TmpReg = makeAnotherReg(CompTy);
+    if (CompTy->isSigned())
+      BuildMI(*MBB, IP, PPC32::EXTSB, 1, TmpReg).addReg(Op0r);
+    else
+      BuildMI(*MBB, IP, PPC32::RLWINM, 4, TmpReg).addReg(Op0r).addImm(0)
+        .addImm(24).addImm(31);
+    Op0r = TmpReg;
+  } else if (Class == cShort) {
+    unsigned TmpReg = makeAnotherReg(CompTy);
+    if (CompTy->isSigned())
+      BuildMI(*MBB, IP, PPC32::EXTSH, 1, TmpReg).addReg(Op0r);
+    else
+      BuildMI(*MBB, IP, PPC32::RLWINM, 4, TmpReg).addReg(Op0r).addImm(0)
+        .addImm(16).addImm(31);
+    Op0r = TmpReg;
+  }
+  
    // Use crand for lt, gt and crandc for le, ge
    unsigned CROpcode = (OpNum == 2 || OpNum == 4) ? PPC32::CRAND : PPC32::CRANDC;
    // ? cr1[lt] : cr1[gt]
@@ -1387,7 +1462,8 @@ void ISel::doCall(const ValueRecord &Ret, MachineInstr *CallMI,
            BuildMI(BB, PPC32::OR, 2, GPR[GPR_idx]).addReg(ArgReg)
              .addReg(ArgReg);
            CallMI->addRegOperand(GPR[GPR_idx], MachineOperand::Use);
-        } else {
+        }
+        if (GPR_remaining <= 0 || isVarArg) {
            BuildMI(BB, PPC32::STW, 3).addReg(ArgReg).addSImm(ArgOffset)
              .addReg(PPC32::R1);
          }
@@ -1400,7 +1476,8 @@ void ISel::doCall(const ValueRecord &Ret, MachineInstr *CallMI,
            BuildMI(BB, PPC32::OR, 2, GPR[GPR_idx]).addReg(ArgReg)
              .addReg(ArgReg);
            CallMI->addRegOperand(GPR[GPR_idx], MachineOperand::Use);
-        } else {
+        }
+        if (GPR_remaining <= 0 || isVarArg) {
            BuildMI(BB, PPC32::STW, 3).addReg(ArgReg).addSImm(ArgOffset)
              .addReg(PPC32::R1);
          }
@@ -1417,7 +1494,8 @@ void ISel::doCall(const ValueRecord &Ret, MachineInstr *CallMI,
              .addReg(ArgReg+1);
            CallMI->addRegOperand(GPR[GPR_idx], MachineOperand::Use);
            CallMI->addRegOperand(GPR[GPR_idx+1], MachineOperand::Use);
-        } else {
+        }
+        if (GPR_remaining <= 1 || isVarArg) {
            BuildMI(BB, PPC32::STW, 3).addReg(ArgReg).addSImm(ArgOffset)
              .addReg(PPC32::R1);
            BuildMI(BB, PPC32::STW, 3).addReg(ArgReg+1).addSImm(ArgOffset+4)
@@ -1599,8 +1677,10 @@ void ISel::LowerUnknownIntrinsicFunctionCalls(Function &F) {
            case Intrinsic::vaend:
            case Intrinsic::returnaddress:
            case Intrinsic::frameaddress:
-            // FIXME: should lower this ourselves
+            // FIXME: should lower these ourselves
              // case Intrinsic::isunordered:
+            // case Intrinsic::memcpy: -> doCall().  system memcpy almost
+            // guaranteed to be faster than anything we generate ourselves
              // We directly implement these intrinsics
              break;
            case Intrinsic::readio: {
@@ -1674,7 +1754,7 @@ void ISel::visitIntrinsicCall(Intrinsic::ID ID, CallInst &CI) {
        BuildMI(BB, PPC32::LI, 1, TmpReg1).addSImm(0);
      }
      return;
-
+    
  #if 0
      // This may be useful for supporting isunordered
    case Intrinsic::isnan:
@@ -2289,7 +2369,7 @@ void ISel::emitShiftOperation(MachineBasicBlock *MBB,
          if (isSigned) {
            // FIXME: Unimplemented
            // Page C-3 of the PowerPC 32bit Programming Environments Manual
-          std::cerr << "ERROR: Unimplemented: signed right shift\n";
+          std::cerr << "ERROR: Unimplemented: signed right shift of long\n";
            abort();
          } else {
            BuildMI(*MBB, IP, PPC32::SUBFIC, 2, TmpReg1).addReg(ShiftAmountReg)
@@ -2344,68 +2424,168 @@ void ISel::emitShiftOperation(MachineBasicBlock *MBB,
  }
  
  
-/// visitLoadInst - Implement LLVM load instructions
+/// visitLoadInst - Implement LLVM load instructions.  Pretty straightforward
+/// mapping of LLVM classes to PPC load instructions, with the exception of
+/// signed byte loads, which need a sign extension following them.
  ///
  void ISel::visitLoadInst(LoadInst &I) {
-  static const unsigned Opcodes[] = { 
-    PPC32::LBZ, PPC32::LHZ, PPC32::LWZ, PPC32::LFS 
+  // Immediate opcodes, for reg+imm addressing
+  static const unsigned ImmOpcodes[] = { 
+    PPC32::LBZ, PPC32::LHZ, PPC32::LWZ, 
+    PPC32::LFS, PPC32::LFD, PPC32::LWZ
+  };
+  // Indexed opcodes, for reg+reg addressing
+  static const unsigned IdxOpcodes[] = {
+    PPC32::LBZX, PPC32::LHZX, PPC32::LWZX,
+    PPC32::LFSX, PPC32::LFDX, PPC32::LWZX
    };
  
-  unsigned Class = getClassB(I.getType());
-  unsigned Opcode = Opcodes[Class];
-  if (I.getType() == Type::DoubleTy) Opcode = PPC32::LFD;
-  if (Class == cShort && I.getType()->isSigned()) Opcode = PPC32::LHA;
-  unsigned DestReg = getReg(I);
+  unsigned Class     = getClassB(I.getType());
+  unsigned ImmOpcode = ImmOpcodes[Class];
+  unsigned IdxOpcode = IdxOpcodes[Class];
+  unsigned DestReg   = getReg(I);
+  Value *SourceAddr  = I.getOperand(0);
+  
+  if (Class == cShort && I.getType()->isSigned()) ImmOpcode = PPC32::LHA;
+  if (Class == cShort && I.getType()->isSigned()) IdxOpcode = PPC32::LHAX;
  
-  if (AllocaInst *AI = dyn_castFixedAlloca(I.getOperand(0))) {
+  if (AllocaInst *AI = dyn_castFixedAlloca(SourceAddr)) {
      unsigned FI = getFixedSizedAllocaFI(AI);
      if (Class == cLong) {
-      addFrameReference(BuildMI(BB, PPC32::LWZ, 2, DestReg), FI);
-      addFrameReference(BuildMI(BB, PPC32::LWZ, 2, DestReg+1), FI, 4);
+      addFrameReference(BuildMI(BB, ImmOpcode, 2, DestReg), FI);
+      addFrameReference(BuildMI(BB, ImmOpcode, 2, DestReg+1), FI, 4);
      } else if (Class == cByte && I.getType()->isSigned()) {
        unsigned TmpReg = makeAnotherReg(I.getType());
-      addFrameReference(BuildMI(BB, Opcode, 2, TmpReg), FI);
+      addFrameReference(BuildMI(BB, ImmOpcode, 2, TmpReg), FI);
        BuildMI(BB, PPC32::EXTSB, 1, DestReg).addReg(TmpReg);
      } else {
-      addFrameReference(BuildMI(BB, Opcode, 2, DestReg), FI);
+      addFrameReference(BuildMI(BB, ImmOpcode, 2, DestReg), FI);
+    }
+    return;
+  }
+  
+  // If this load is the only use of the GEP instruction that is its address,
+  // then we can fold the GEP directly into the load instruction.
+  // emitGEPOperation with a second to last arg of 'true' will place the
+  // base register for the GEP into baseReg, and the constant offset from that
+  // into offset.  If the offset fits in 16 bits, we can emit a reg+imm store
+  // otherwise, we copy the offset into another reg, and use reg+reg addressing.
+  if (GetElementPtrInst *GEPI = canFoldGEPIntoLoadOrStore(SourceAddr)) {
+    unsigned baseReg = getReg(GEPI);
+    ConstantSInt *offset;
+    
+    emitGEPOperation(BB, BB->end(), GEPI->getOperand(0), GEPI->op_begin()+1, 
+                     GEPI->op_end(), baseReg, true, &offset);
+
+    if (Class != cLong && canUseAsImmediateForOpcode(offset, 0)) {
+      if (Class == cByte && I.getType()->isSigned()) {
+        unsigned TmpReg = makeAnotherReg(I.getType());
+        BuildMI(BB, ImmOpcode, 2, TmpReg).addSImm(offset->getValue())
+          .addReg(baseReg);
+        BuildMI(BB, PPC32::EXTSB, 1, DestReg).addReg(TmpReg);
+      } else {
+        BuildMI(BB, ImmOpcode, 2, DestReg).addSImm(offset->getValue())
+          .addReg(baseReg);
+      }
+      return;
      }
-  } else {
-    unsigned SrcAddrReg = getReg(I.getOperand(0));
      
+    unsigned indexReg = getReg(offset);
+
      if (Class == cLong) {
-      BuildMI(BB, PPC32::LWZ, 2, DestReg).addSImm(0).addReg(SrcAddrReg);
-      BuildMI(BB, PPC32::LWZ, 2, DestReg+1).addSImm(4).addReg(SrcAddrReg);
+      unsigned indexPlus4 = makeAnotherReg(Type::IntTy);
+      BuildMI(BB, PPC32::ADDI, 2, indexPlus4).addReg(indexReg).addSImm(4);
+      BuildMI(BB, IdxOpcode, 2, DestReg).addReg(indexReg).addReg(baseReg);
+      BuildMI(BB, IdxOpcode, 2, DestReg+1).addReg(indexPlus4).addReg(baseReg);
      } else if (Class == cByte && I.getType()->isSigned()) {
        unsigned TmpReg = makeAnotherReg(I.getType());
-      BuildMI(BB, Opcode, 2, TmpReg).addSImm(0).addReg(SrcAddrReg);
+      BuildMI(BB, IdxOpcode, 2, DestReg).addReg(indexReg).addReg(baseReg);
        BuildMI(BB, PPC32::EXTSB, 1, DestReg).addReg(TmpReg);
      } else {
-      BuildMI(BB, Opcode, 2, DestReg).addSImm(0).addReg(SrcAddrReg);
+      BuildMI(BB, IdxOpcode, 2, DestReg).addReg(indexReg).addReg(baseReg);
      }
+    return;
+  }
+  
+  // The fallback case, where the load was from a source that could not be
+  // folded into the load instruction. 
+  unsigned SrcAddrReg = getReg(SourceAddr);
+    
+  if (Class == cLong) {
+    BuildMI(BB, ImmOpcode, 2, DestReg).addSImm(0).addReg(SrcAddrReg);
+    BuildMI(BB, ImmOpcode, 2, DestReg+1).addSImm(4).addReg(SrcAddrReg);
+  } else if (Class == cByte && I.getType()->isSigned()) {
+    unsigned TmpReg = makeAnotherReg(I.getType());
+    BuildMI(BB, ImmOpcode, 2, TmpReg).addSImm(0).addReg(SrcAddrReg);
+    BuildMI(BB, PPC32::EXTSB, 1, DestReg).addReg(TmpReg);
+  } else {
+    BuildMI(BB, ImmOpcode, 2, DestReg).addSImm(0).addReg(SrcAddrReg);
    }
  }
  
  /// visitStoreInst - Implement LLVM store instructions
  ///
  void ISel::visitStoreInst(StoreInst &I) {
-  unsigned ValReg      = getReg(I.getOperand(0));
-  unsigned AddressReg  = getReg(I.getOperand(1));
- 
-  const Type *ValTy = I.getOperand(0)->getType();
-  unsigned Class = getClassB(ValTy);
+  // Immediate opcodes, for reg+imm addressing
+  static const unsigned ImmOpcodes[] = {
+    PPC32::STB, PPC32::STH, PPC32::STW, 
+    PPC32::STFS, PPC32::STFD, PPC32::STW
+  };
+  // Indexed opcodes, for reg+reg addressing
+  static const unsigned IdxOpcodes[] = {
+    PPC32::STBX, PPC32::STHX, PPC32::STWX, 
+    PPC32::STFSX, PPC32::STDX, PPC32::STWX
+  };
+  
+  Value *SourceAddr  = I.getOperand(1);
+  const Type *ValTy  = I.getOperand(0)->getType();
+  unsigned Class     = getClassB(ValTy);
+  unsigned ImmOpcode = ImmOpcodes[Class];
+  unsigned IdxOpcode = IdxOpcodes[Class];
+  unsigned ValReg    = getReg(I.getOperand(0));
+
+  // If this store is the only use of the GEP instruction that is its address,
+  // then we can fold the GEP directly into the store instruction.
+  // emitGEPOperation with a second to last arg of 'true' will place the
+  // base register for the GEP into baseReg, and the constant offset from that
+  // into offset.  If the offset fits in 16 bits, we can emit a reg+imm store
+  // otherwise, we copy the offset into another reg, and use reg+reg addressing.
+  if (GetElementPtrInst *GEPI = canFoldGEPIntoLoadOrStore(SourceAddr)) {
+    unsigned baseReg = getReg(GEPI);
+    ConstantSInt *offset;
+    
+    emitGEPOperation(BB, BB->end(), GEPI->getOperand(0), GEPI->op_begin()+1, 
+                     GEPI->op_end(), baseReg, true, &offset);
+
+    if (Class != cLong && canUseAsImmediateForOpcode(offset, 0)) {
+      BuildMI(BB, ImmOpcode, 3).addReg(ValReg).addSImm(offset->getValue())
+        .addReg(baseReg);
+      return;
+    }
+    
+    unsigned indexReg = getReg(offset);
  
+    if (Class == cLong) {
+      unsigned indexPlus4 = makeAnotherReg(Type::IntTy);
+      BuildMI(BB, PPC32::ADDI, 2, indexPlus4).addReg(indexReg).addSImm(4);
+      BuildMI(BB, IdxOpcode, 3).addReg(ValReg).addReg(indexReg).addReg(baseReg);
+      BuildMI(BB, IdxOpcode, 3).addReg(ValReg+1).addReg(indexPlus4)
+        .addReg(baseReg);
+      return;
+    }
+    BuildMI(BB, IdxOpcode, 3).addReg(ValReg).addReg(indexReg).addReg(baseReg);
+    return;
+  }
+  
+  // If the store address wasn't the only use of a GEP, we fall back to the
+  // standard path: store the ValReg at the value in AddressReg.
+  unsigned AddressReg  = getReg(I.getOperand(1));
    if (Class == cLong) {
-    BuildMI(BB, PPC32::STW, 3).addReg(ValReg).addSImm(0).addReg(AddressReg);
-    BuildMI(BB, PPC32::STW, 3).addReg(ValReg+1).addSImm(4).addReg(AddressReg);
+    BuildMI(BB, ImmOpcode, 3).addReg(ValReg).addSImm(0).addReg(AddressReg);
+    BuildMI(BB, ImmOpcode, 3).addReg(ValReg+1).addSImm(4).addReg(AddressReg);
      return;
    }
-
-  static const unsigned Opcodes[] = {
-    PPC32::STB, PPC32::STH, PPC32::STW, PPC32::STFS
-  };
-  unsigned Opcode = Opcodes[Class];
-  if (ValTy == Type::DoubleTy) Opcode = PPC32::STFD;
-  BuildMI(BB, Opcode, 3).addReg(ValReg).addSImm(0).addReg(AddressReg);
+  BuildMI(BB, ImmOpcode, 3).addReg(ValReg).addSImm(0).addReg(AddressReg);
  }
  
  
@@ -2417,10 +2597,6 @@ void ISel::visitCastInst(CastInst &CI) {
  
    unsigned SrcClass = getClassB(Op->getType());
    unsigned DestClass = getClassB(CI.getType());
-  // Noop casts are not emitted: getReg will return the source operand as the
-  // register to use for any uses of the noop cast.
-  if (DestClass == SrcClass)
-    return;
  
    // If this is a cast from a 32-bit integer to a Long type, and the only uses
    // of the case are GEP instructions, then the cast does not need to be
@@ -2484,24 +2660,6 @@ void ISel::emitCastOperation(MachineBasicBlock *MBB,
      return;
    }
  
-  // Implement casts between values of the same type class (as determined by
-  // getClass) by using a register-to-register move.
-  if (SrcClass == DestClass) {
-    if (SrcClass <= cInt) {
-      BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
-    } else if (SrcClass == cFP32 || SrcClass == cFP64) {
-      BuildMI(*MBB, IP, PPC32::FMR, 1, DestReg).addReg(SrcReg);
-    } else if (SrcClass == cLong) {
-      BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
-      BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg+1)
-        .addReg(SrcReg+1);
-    } else {
-      assert(0 && "Cannot handle this type of cast instruction!");
-      abort();
-    }
-    return;
-  }
-  
    // Handle cast of Float -> Double
    if (SrcClass == cFP32 && DestClass == cFP64) {
      BuildMI(*MBB, IP, PPC32::FMR, 1, DestReg).addReg(SrcReg);
@@ -2514,52 +2672,6 @@ void ISel::emitCastOperation(MachineBasicBlock *MBB,
      return;
    }
    
-  // Handle cast of SMALLER int to LARGER int using a move with sign extension
-  // or zero extension, depending on whether the source type was signed.
-  if (SrcClass <= cInt && (DestClass <= cInt || DestClass == cLong) &&
-      SrcClass < DestClass) {
-    bool isLong = DestClass == cLong;
-    if (isLong) {
-      DestClass = cInt;
-      ++DestReg;
-    }
-    
-    bool isUnsigned = DestTy->isUnsigned() || DestTy == Type::BoolTy;
-    BuildMI(*BB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
-
-    if (isLong) {  // Handle upper 32 bits as appropriate...
-      --DestReg;
-      if (isUnsigned)     // Zero out top bits...
-        BuildMI(*BB, IP, PPC32::LI, 1, DestReg).addSImm(0);
-      else                // Sign extend bottom half...
-        BuildMI(*BB, IP, PPC32::SRAWI, 2, DestReg).addReg(SrcReg).addImm(31);
-    }
-    return;
-  }
-
-  // Special case long -> int ...
-  if (SrcClass == cLong && DestClass == cInt) {
-    BuildMI(*BB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg+1).addReg(SrcReg+1);
-    return;
-  }
-  
-  // Handle cast of LARGER int to SMALLER int with a clear or sign extend
-  if ((SrcClass <= cInt || SrcClass == cLong) && DestClass <= cInt && 
-      SrcClass > DestClass) {
-    bool isUnsigned = DestTy->isUnsigned() || DestTy == Type::BoolTy;
-    unsigned source = (SrcClass == cLong) ? SrcReg+1 : SrcReg;
-    
-    if (isUnsigned) {
-      unsigned shift = (DestClass == cByte) ? 24 : 16;
-      BuildMI(*BB, IP, PPC32::RLWINM, 4, DestReg).addReg(source).addZImm(0)
-        .addImm(shift).addImm(31);
-    } else {
-      BuildMI(*BB, IP, (DestClass == cByte) ? PPC32::EXTSB : PPC32::EXTSH, 1, 
-              DestReg).addReg(source);
-    }
-    return;
-  }
-
    // Handle casts from integer to floating point now...
    if (DestClass == cFP32 || DestClass == cFP64) {
  
@@ -2624,7 +2736,7 @@ void ISel::emitCastOperation(MachineBasicBlock *MBB,
        addFrameReference(BuildMI(*BB, IP, PPC32::LFD, 2, ConstF), 
                          ConstantFrameIndex);
        addFrameReference(BuildMI(*BB, IP, PPC32::LFD, 2, TempF), ValueFrameIdx);
-      BuildMI(*BB, IP, PPC32::FSUB, 2, DestReg).addReg(TempF ).addReg(ConstF);
+      BuildMI(*BB, IP, PPC32::FSUB, 2, DestReg).addReg(TempF).addReg(ConstF);
      }
      return;
    }
@@ -2647,23 +2759,26 @@ void ISel::emitCastOperation(MachineBasicBlock *MBB,
        F->getFrameInfo()->CreateStackObject(SrcTy, TM.getTargetData());
  
      if (DestTy->isSigned()) {
-      unsigned LoadOp = (DestClass == cShort) ? PPC32::LHA : PPC32::LWZ;
        unsigned TempReg = makeAnotherReg(Type::DoubleTy);
        
        // Convert to integer in the FP reg and store it to a stack slot
        BuildMI(*BB, IP, PPC32::FCTIWZ, 1, TempReg).addReg(SrcReg);
        addFrameReference(BuildMI(*BB, IP, PPC32::STFD, 3)
                            .addReg(TempReg), ValueFrameIdx);
-      
-      // There is no load signed byte opcode, so we must emit a sign extend
+
+      // There is no load signed byte opcode, so we must emit a sign extend for
+      // that particular size.  Make sure to source the new integer from the 
+      // correct offset.
        if (DestClass == cByte) {
          unsigned TempReg2 = makeAnotherReg(DestTy);
-        addFrameReference(BuildMI(*BB, IP, LoadOp, 2, TempReg2), 
-                          ValueFrameIdx, 4);
+        addFrameReference(BuildMI(*BB, IP, PPC32::LBZ, 2, TempReg2), 
+                          ValueFrameIdx, 7);
          BuildMI(*MBB, IP, PPC32::EXTSB, DestReg).addReg(TempReg2);
        } else {
+        int offset = (DestClass == cShort) ? 6 : 4;
+        unsigned LoadOp = (DestClass == cShort) ? PPC32::LHA : PPC32::LWZ;
          addFrameReference(BuildMI(*BB, IP, LoadOp, 2, DestReg), 
-                          ValueFrameIdx, 4);
+                          ValueFrameIdx, offset);
        }
      } else {
        unsigned Zero = getReg(ConstantFP::get(Type::DoubleTy, 0.0f));
@@ -2710,29 +2825,228 @@ void ISel::emitCastOperation(MachineBasicBlock *MBB,
        BuildMI(*BB, IP, PPC32::FCTIWZ, 1, ConvReg).addReg(TmpReg2);
        addFrameReference(BuildMI(*BB, IP, PPC32::STFD, 3).addReg(ConvReg),
                          FrameIdx);
-      addFrameReference(BuildMI(*BB, IP, PPC32::LWZ, 2, IntTmp),
-                        FrameIdx, 4);
-      BuildMI(*BB, IP, PPC32::BLT, 2).addReg(PPC32::CR0).addMBB(PhiMBB);
-      BuildMI(*BB, IP, PPC32::B, 1).addMBB(XorMBB);
-
-      // XorMBB:
-      //   add 2**31 if input was >= 2**31
-      BB = XorMBB;
-      BuildMI(BB, PPC32::XORIS, 2, XorReg).addReg(IntTmp).addImm(0x8000);
-      BuildMI(BB, PPC32::B, 1).addMBB(PhiMBB);
-      XorMBB->addSuccessor(PhiMBB);
-
-      // PhiMBB:
-      //   DestReg = phi [ IntTmp, OldMBB ], [ XorReg, XorMBB ]
-      BB = PhiMBB;
-      BuildMI(BB, PPC32::PHI, 2, DestReg).addReg(IntTmp).addMBB(OldMBB)
-        .addReg(XorReg).addMBB(XorMBB);
+      if (DestClass == cByte) {
+        addFrameReference(BuildMI(*BB, IP, PPC32::LBZ, 2, DestReg),
+                          FrameIdx, 7);
+      } else if (DestClass == cShort) {
+        addFrameReference(BuildMI(*BB, IP, PPC32::LHZ, 2, DestReg),
+                          FrameIdx, 6);
+      } if (DestClass == cInt) {
+        addFrameReference(BuildMI(*BB, IP, PPC32::LWZ, 2, IntTmp),
+                          FrameIdx, 4);
+        BuildMI(*BB, IP, PPC32::BLT, 2).addReg(PPC32::CR0).addMBB(PhiMBB);
+        BuildMI(*BB, IP, PPC32::B, 1).addMBB(XorMBB);
+
+        // XorMBB:
+        //   add 2**31 if input was >= 2**31
+        BB = XorMBB;
+        BuildMI(BB, PPC32::XORIS, 2, XorReg).addReg(IntTmp).addImm(0x8000);
+        XorMBB->addSuccessor(PhiMBB);
+
+        // PhiMBB:
+        //   DestReg = phi [ IntTmp, OldMBB ], [ XorReg, XorMBB ]
+        BB = PhiMBB;
+        BuildMI(BB, PPC32::PHI, 2, DestReg).addReg(IntTmp).addMBB(OldMBB)
+          .addReg(XorReg).addMBB(XorMBB);
+      }
+    }
+    return;
+  }
+
+  // Check our invariants
+  assert((SrcClass <= cInt || SrcClass == cLong) && 
+         "Unhandled source class for cast operation!");
+  assert((DestClass <= cInt || DestClass == cLong) && 
+         "Unhandled destination class for cast operation!");
+
+  bool sourceUnsigned = SrcTy->isUnsigned() || SrcTy == Type::BoolTy;
+  bool destUnsigned = DestTy->isUnsigned();
+
+  // Unsigned -> Unsigned, clear if larger, 
+  if (sourceUnsigned && destUnsigned) {
+    // handle long dest class now to keep switch clean
+    if (DestClass == cLong) {
+      if (SrcClass == cLong) {
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg+1)
+          .addReg(SrcReg+1);
+      } else {
+        BuildMI(*MBB, IP, PPC32::LI, 1, DestReg).addSImm(0);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg)
+          .addReg(SrcReg);
+      }
+      return;
+    }
+
+    // handle u{ byte, short, int } x u{ byte, short, int }
+    unsigned clearBits = (SrcClass == cByte || DestClass == cByte) ? 24 : 16;
+    switch (SrcClass) {
+    case cByte:
+    case cShort:
+      if (SrcClass == DestClass)
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg)
+          .addImm(0).addImm(clearBits).addImm(31);
+      break;
+    case cLong:
+      ++SrcReg;
+      // Fall through
+    case cInt:
+      if (DestClass == cInt)
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg)
+          .addImm(0).addImm(clearBits).addImm(31);
+      break;
+    }
+    return;
+  }
+
+  // Signed -> Signed
+  if (!sourceUnsigned && !destUnsigned) {
+    // handle long dest class now to keep switch clean
+    if (DestClass == cLong) {
+      if (SrcClass == cLong) {
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg+1)
+          .addReg(SrcReg+1);
+      } else {
+        BuildMI(*MBB, IP, PPC32::SRAWI, 2, DestReg).addReg(SrcReg).addImm(31);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg)
+          .addReg(SrcReg);
+      }
+      return;
+    }
+
+    // handle { byte, short, int } x { byte, short, int }
+    switch (SrcClass) {
+    case cByte:
+      if (DestClass == cByte)
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      break;
+    case cShort:
+      if (DestClass == cByte)
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      else if (DestClass == cShort)
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::EXTSH, 1, DestReg).addReg(SrcReg);
+      break;
+    case cLong:
+      ++SrcReg;
+      // Fall through
+    case cInt:
+      if (DestClass == cByte)
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      else if (DestClass == cShort)
+        BuildMI(*MBB, IP, PPC32::EXTSH, 1, DestReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      break;
+    }
+    return;
+  }
+
+  // Unsigned -> Signed
+  if (sourceUnsigned && !destUnsigned) {
+    // handle long dest class now to keep switch clean
+    if (DestClass == cLong) {
+      if (SrcClass == cLong) {
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg+1).
+          addReg(SrcReg+1);
+      } else {
+        BuildMI(*MBB, IP, PPC32::LI, 1, DestReg).addSImm(0);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg)
+          .addReg(SrcReg);
+      }
+      return;
+    }
+
+    // handle u{ byte, short, int } -> { byte, short, int }
+    switch (SrcClass) {
+    case cByte:
+      if (DestClass == cByte)
+        // uByte 255 -> signed byte == -1
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      else
+        // uByte 255 -> signed short/int == 255
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg).addImm(0)
+          .addImm(24).addImm(31);
+      break;
+    case cShort:
+      if (DestClass == cByte)
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      else if (DestClass == cShort)
+        BuildMI(*MBB, IP, PPC32::EXTSH, 1, DestReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg).addImm(0)
+          .addImm(16).addImm(31);
+      break;
+    case cLong:
+      ++SrcReg;
+      // Fall through
+    case cInt:
+      if (DestClass == cByte)
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      else if (DestClass == cShort)
+        BuildMI(*MBB, IP, PPC32::EXTSH, 1, DestReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      break;
+    }
+    return;
+  }
+
+  // Signed -> Unsigned
+  if (!sourceUnsigned && destUnsigned) {
+    // handle long dest class now to keep switch clean
+    if (DestClass == cLong) {
+      if (SrcClass == cLong) {
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg+1)
+          .addReg(SrcReg+1);
+      } else {
+        BuildMI(*MBB, IP, PPC32::SRAWI, 2, DestReg).addReg(SrcReg).addImm(31);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg)
+          .addReg(SrcReg);
+      }
+      return;
+    }
+
+    // handle { byte, short, int } -> u{ byte, short, int }
+    unsigned clearBits = (DestClass == cByte) ? 24 : 16;
+    switch (SrcClass) {
+    case cByte:
+    case cShort:
+      if (DestClass == cByte || DestClass == cShort)
+        // sbyte -1 -> ubyte 0x000000FF
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg)
+          .addImm(0).addImm(clearBits).addImm(31);
+      else
+        // sbyte -1 -> ubyte 0xFFFFFFFF
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      break;
+    case cLong:
+      ++SrcReg;
+      // Fall through
+    case cInt:
+      if (DestClass == cInt)
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg)
+          .addImm(0).addImm(clearBits).addImm(31);
+      break;
      }
      return;
    }
  
    // Anything we haven't handled already, we can't (yet) handle at all.
-  assert(0 && "Unhandled cast instruction!");
+  std::cerr << "Unhandled cast from " << SrcTy->getDescription()
+            << "to " << DestTy->getDescription() << '\n';
    abort();
  }
  
@@ -2783,6 +3097,9 @@ void ISel::visitVAArgInst(VAArgInst &I) {
      BuildMI(BB, PPC32::LWZ, 2, DestReg).addSImm(0).addReg(VAList);
      BuildMI(BB, PPC32::LWZ, 2, DestReg+1).addSImm(4).addReg(VAList);
      break;
+  case Type::FloatTyID:
+    BuildMI(BB, PPC32::LFS, 2, DestReg).addSImm(0).addReg(VAList);
+    break;
    case Type::DoubleTyID:
      BuildMI(BB, PPC32::LFD, 2, DestReg).addSImm(0).addReg(VAList);
      break;
@@ -2792,9 +3109,12 @@ void ISel::visitVAArgInst(VAArgInst &I) {
  /// visitGetElementPtrInst - instruction-select GEP instructions
  ///
  void ISel::visitGetElementPtrInst(GetElementPtrInst &I) {
+  if (canFoldGEPIntoLoadOrStore(&I))
+    return;
+
    unsigned outputReg = getReg(I);
    emitGEPOperation(BB, BB->end(), I.getOperand(0), I.op_begin()+1, I.op_end(), 
-                   outputReg);
+                   outputReg, false, 0);
  }
  
  /// emitGEPOperation - Common code shared between visitGetElementPtrInst and
@@ -2803,16 +3123,16 @@ void ISel::visitGetElementPtrInst(GetElementPtrInst &I) {
  void ISel::emitGEPOperation(MachineBasicBlock *MBB,
                              MachineBasicBlock::iterator IP,
                              Value *Src, User::op_iterator IdxBegin,
-                            User::op_iterator IdxEnd, unsigned TargetReg) {
+                            User::op_iterator IdxEnd, unsigned TargetReg,
+                            bool GEPIsFolded, ConstantSInt **RemainderPtr) {
    const TargetData &TD = TM.getTargetData();
    const Type *Ty = Src->getType();
    unsigned basePtrReg = getReg(Src, MBB, IP);
    int64_t constValue = 0;
-  bool anyCombined = false;
    
    // Record the operations to emit the GEP in a vector so that we can emit them
    // after having analyzed the entire instruction.
-  std::vector<CollapsedGepOp*> ops;
+  std::vector<CollapsedGepOp> ops;
    
    // GEPs have zero or more indices; we must perform a struct access
    // or array access for each one.
@@ -2829,7 +3149,6 @@ void ISel::emitGEPOperation(MachineBasicBlock *MBB,
        unsigned fieldIndex = cast<ConstantUInt>(idx)->getValue();
        unsigned memberOffset =
          TD.getStructLayout(StTy)->MemberOffsets[fieldIndex];
-      if (constValue != 0) anyCombined = true;
  
        // StructType member offsets are always constant values.  Add it to the
        // running total.
@@ -2854,8 +3173,6 @@ void ISel::emitGEPOperation(MachineBasicBlock *MBB,
        unsigned elementSize = TD.getTypeSize(Ty);
        
        if (ConstantInt *C = dyn_cast<ConstantInt>(idx)) {
-        if (constValue != 0) anyCombined = true;
-
          if (ConstantSInt *CS = dyn_cast<ConstantSInt>(C))
            constValue += CS->getValue() * elementSize;
          else if (ConstantUInt *CU = dyn_cast<ConstantUInt>(C))
@@ -2864,48 +3181,40 @@ void ISel::emitGEPOperation(MachineBasicBlock *MBB,
            assert(0 && "Invalid ConstantInt GEP index type!");
        } else {
          // Push current gep state to this point as an add
-        CollapsedGepOp *addition = 
-          new CollapsedGepOp(false, 0, ConstantSInt::get(Type::IntTy,
-                constValue));
-        ops.push_back(addition);
+        ops.push_back(CollapsedGepOp(false, 0, 
+          ConstantSInt::get(Type::IntTy,constValue)));
          
          // Push multiply gep op and reset constant value
-        CollapsedGepOp *multiply = 
-          new CollapsedGepOp(true, idx, ConstantSInt::get(Type::IntTy, 
-                elementSize));
-        ops.push_back(multiply);
+        ops.push_back(CollapsedGepOp(true, idx, 
+          ConstantSInt::get(Type::IntTy, elementSize)));
          
          constValue = 0;
        }
      }
    }
-  // Do some statistical accounting
-  if (ops.empty()) ++GEPConsts; 
-  if (anyCombined) ++GEPSplits;
-    
    // Emit instructions for all the collapsed ops
-  for(std::vector<CollapsedGepOp *>::iterator cgo_i = ops.begin(),
+  for(std::vector<CollapsedGepOp>::iterator cgo_i = ops.begin(),
        cgo_e = ops.end(); cgo_i != cgo_e; ++cgo_i) {
-    CollapsedGepOp *cgo = *cgo_i;
+    CollapsedGepOp& cgo = *cgo_i;
      unsigned nextBasePtrReg = makeAnotherReg (Type::IntTy);
  
-    if (cgo->isMul) {
+    if (cgo.isMul) {
        // We know the elementSize is a constant, so we can emit a constant mul
        // and then add it to the current base reg
        unsigned TmpReg = makeAnotherReg(Type::IntTy);
-      doMultiplyConst(MBB, IP, TmpReg, cgo->index, cgo->size);
+      doMultiplyConst(MBB, IP, TmpReg, cgo.index, cgo.size);
        BuildMI(*MBB, IP, PPC32::ADD, 2, nextBasePtrReg).addReg(basePtrReg)
          .addReg(TmpReg);
      } else {
        // Try and generate an immediate addition if possible
-      if (cgo->size->isNullValue()) {
+      if (cgo.size->isNullValue()) {
          BuildMI(*MBB, IP, PPC32::OR, 2, nextBasePtrReg).addReg(basePtrReg)
            .addReg(basePtrReg);
-      } else if (canUseAsImmediateForOpcode(cgo->size, 0)) {
+      } else if (canUseAsImmediateForOpcode(cgo.size, 0)) {
          BuildMI(*MBB, IP, PPC32::ADDI, 2, nextBasePtrReg).addReg(basePtrReg)
-          .addSImm(cgo->size->getValue());
+          .addSImm(cgo.size->getValue());
        } else {
-        unsigned Op1r = getReg(cgo->size, MBB, IP);
+        unsigned Op1r = getReg(cgo.size, MBB, IP);
          BuildMI(*MBB, IP, PPC32::ADD, 2, nextBasePtrReg).addReg(basePtrReg)
            .addReg(Op1r);
        }
@@ -2916,6 +3225,15 @@ void ISel::emitGEPOperation(MachineBasicBlock *MBB,
    // Add the current base register plus any accumulated constant value
    ConstantSInt *remainder = ConstantSInt::get(Type::IntTy, constValue);
    
+  // If we are emitting this during a fold, copy the current base register to
+  // the target, and save the current constant offset so the folding load or
+  // store can try and use it as an immediate.
+  if (GEPIsFolded) {
+    BuildMI (BB, PPC32::OR, 2, TargetReg).addReg(basePtrReg).addReg(basePtrReg);
+    *RemainderPtr = remainder;
+    return;
+  }
+  
    // After we have processed all the indices, the result is left in
    // basePtrReg.  Move it to the register where we were expected to
    // put the answer.
diff --git a/lib/Target/PowerPC/PowerPCISelSimple.cpp b/lib/Target/PowerPC/PowerPCISelSimple.cpp

index c838bdaa2f5fa34f17bb9dbdb4bc7db6574c06cc..ae7cf3ed40c6b803e4aaeed725fa8042d7fea1e9 100644 (file)
--- a/lib/Target/PowerPC/PowerPCISelSimple.cpp
+++ b/lib/Target/PowerPC/PowerPCISelSimple.cpp
@@ -32,8 +32,7 @@
  using namespace llvm;
  
  namespace {
-  Statistic<> GEPConsts("ppc-codegen", "Number of const GEPs");
-  Statistic<> GEPSplits("ppc-codegen", "Number of partially const GEPs");
+  Statistic<> GEPFolds("ppc-codegen", "Number of GEPs folded");
  
    /// TypeClass - Used by the PowerPC backend to group LLVM types by their basic
    /// PPC Representation.
@@ -41,17 +40,6 @@ namespace {
    enum TypeClass {
      cByte, cShort, cInt, cFP32, cFP64, cLong
    };
-
-  // This struct is for recording the necessary operations to emit the GEP
-  typedef struct CollapsedGepOp {
-  public:
-    CollapsedGepOp(bool mul, Value *i, ConstantSInt *s) : 
-      isMul(mul), index(i), size(s) {}
-  
-    bool isMul;
-    Value *index;
-    ConstantSInt *size;
-  } CollapsedGepOp;
  }
  
  /// getClass - Turn a primitive type into a "class" number which is based on the
@@ -90,7 +78,7 @@ namespace {
      MachineFunction *F;                 // The function we are compiling into
      MachineBasicBlock *BB;              // The current MBB we are compiling
      int VarArgsFrameIndex;              // FrameIndex for start of varargs area
-
+    
      std::map<Value*, unsigned> RegMap;  // Mapping between Values and SSA Regs
  
      // External functions used in the Module
@@ -105,6 +93,11 @@ namespace {
      // FrameIndex for the alloca.
      std::map<AllocaInst*, unsigned> AllocaMap;
  
+    // A Reg to hold the base address used for global loads and stores, and a
+    // flag to set whether or not we need to emit it for this function.
+    unsigned GlobalBaseReg;
+    bool GlobalBaseInitialized;
+    
      ISel(TargetMachine &tm) : TM(reinterpret_cast<PowerPCTargetMachine&>(tm)), 
        F(0), BB(0) {}
  
@@ -158,6 +151,9 @@ namespace {
  
        BB = &F->front();
  
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseInitialized = false;
+
        // Copy incoming arguments off of the stack...
        LoadArgumentsToVirtualRegs(Fn);
  
@@ -219,6 +215,16 @@ namespace {
        ValueRecord(unsigned R, const Type *T) : Val(0), Reg(R), Ty(T) {}
        ValueRecord(Value *V) : Val(V), Reg(0), Ty(V->getType()) {}
      };
+    
+    // This struct is for recording the necessary operations to emit the GEP
+    struct CollapsedGepOp {
+      bool isMul;
+      Value *index;
+      ConstantSInt *size;
+      CollapsedGepOp(bool mul, Value *i, ConstantSInt *s) :
+        isMul(mul), index(i), size(s) {}
+    };
+
      void doCall(const ValueRecord &Ret, MachineInstr *CallMI,
                  const std::vector<ValueRecord> &Args, bool isVarArg);
      void visitCallInst(CallInst &I);
@@ -276,7 +282,8 @@ namespace {
      ///
      void emitGEPOperation(MachineBasicBlock *BB, MachineBasicBlock::iterator IP,
                            Value *Src, User::op_iterator IdxBegin,
-                          User::op_iterator IdxEnd, unsigned TargetReg);
+                          User::op_iterator IdxEnd, unsigned TargetReg,
+                          bool CollapseRemainder, ConstantSInt **Remainder);
  
      /// emitCastOperation - Common code shared between visitCastInst and
      /// constant expression cast support.
@@ -335,11 +342,19 @@ namespace {
        
      /// emitSelectOperation - Common code shared between visitSelectInst and the
      /// constant expression support.
+    ///
      void emitSelectOperation(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator IP,
                               Value *Cond, Value *TrueVal, Value *FalseVal,
                               unsigned DestReg);
  
+    /// copyGlobalBaseToRegister - Output the instructions required to put the
+    /// base address to use for accessing globals into a register.
+    ///
+    void ISel::copyGlobalBaseToRegister(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator IP,
+                                        unsigned R);
+
      /// copyConstantToRegister - Output the instructions required to put the
      /// specified constant into the specified register.
      ///
@@ -417,10 +432,6 @@ unsigned ISel::getReg(Value *V, MachineBasicBlock *MBB,
      unsigned Reg = makeAnotherReg(V->getType());
      copyConstantToRegister(MBB, IPt, C, Reg);
      return Reg;
-  } else if (CastInst *CI = dyn_cast<CastInst>(V)) {
-    // Do not emit noop casts at all.
-    if (getClassB(CI->getType()) == getClassB(CI->getOperand(0)->getType()))
-      return getReg(CI->getOperand(0), MBB, IPt);
    } else if (AllocaInst *AI = dyn_castFixedAlloca(V)) {
      unsigned Reg = makeAnotherReg(V->getType());
      unsigned FI = getFixedSizedAllocaFI(AI);
@@ -500,6 +511,26 @@ unsigned ISel::getFixedSizedAllocaFI(AllocaInst *AI) {
  }
  
  
+/// copyGlobalBaseToRegister - Output the instructions required to put the
+/// base address to use for accessing globals into a register.
+///
+void ISel::copyGlobalBaseToRegister(MachineBasicBlock *MBB,
+                                    MachineBasicBlock::iterator IP,
+                                    unsigned R) {
+  if (!GlobalBaseInitialized) {
+    // Insert the set of GlobalBaseReg into the first MBB of the function
+    MachineBasicBlock &FirstMBB = F->front();
+    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+    GlobalBaseReg = makeAnotherReg(Type::IntTy);
+    BuildMI(FirstMBB, MBBI, PPC32::MovePCtoLR, 0, GlobalBaseReg);
+    GlobalBaseInitialized = true;
+  }
+  // Emit our copy of GlobalBaseReg to the destination register in the
+  // current MBB
+  BuildMI(*MBB, IP, PPC32::OR, 2, R).addReg(GlobalBaseReg)
+    .addReg(GlobalBaseReg);
+}
+
  /// copyConstantToRegister - Output the instructions required to put the
  /// specified constant into the specified register.
  ///
@@ -567,14 +598,13 @@ void ISel::copyConstantToRegister(MachineBasicBlock *MBB,
  
      assert(Ty == Type::FloatTy || Ty == Type::DoubleTy && "Unknown FP type!");
  
-    // Load addr of constant to reg; constant is located at PC + distance
-    unsigned CurPC = makeAnotherReg(Type::IntTy);
+    // Load addr of constant to reg; constant is located at base + distance
+    unsigned GlobalBase = makeAnotherReg(Type::IntTy);
      unsigned Reg1 = makeAnotherReg(Type::IntTy);
      unsigned Reg2 = makeAnotherReg(Type::IntTy);
-    // Move PC to destination reg
-    BuildMI(*MBB, IP, PPC32::MovePCtoLR, 0, CurPC);
-    // Move value at PC + distance into return reg
-    BuildMI(*MBB, IP, PPC32::LOADHiAddr, 2, Reg1).addReg(CurPC)
+    // Move value at base + distance into return reg
+    copyGlobalBaseToRegister(MBB, IP, GlobalBase);
+    BuildMI(*MBB, IP, PPC32::LOADHiAddr, 2, Reg1).addReg(GlobalBase)
        .addConstantPoolIndex(CPI);
      BuildMI(*MBB, IP, PPC32::LOADLoDirect, 2, Reg2).addReg(Reg1)
        .addConstantPoolIndex(CPI);
@@ -585,16 +615,15 @@ void ISel::copyConstantToRegister(MachineBasicBlock *MBB,
      // Copy zero (null pointer) to the register.
      BuildMI(*MBB, IP, PPC32::LI, 1, R).addSImm(0);
    } else if (GlobalValue *GV = dyn_cast<GlobalValue>(C)) {
-    // GV is located at PC + distance
-    unsigned CurPC = makeAnotherReg(Type::IntTy);
+    // GV is located at base + distance
+    unsigned GlobalBase = makeAnotherReg(Type::IntTy);
      unsigned TmpReg = makeAnotherReg(GV->getType());
      unsigned Opcode = (GV->hasWeakLinkage() || GV->isExternal()) ? 
        PPC32::LOADLoIndirect : PPC32::LOADLoDirect;
-      
-    // Move PC to destination reg
-    BuildMI(*MBB, IP, PPC32::MovePCtoLR, 0, CurPC);
-    // Move value at PC + distance into return reg
-    BuildMI(*MBB, IP, PPC32::LOADHiAddr, 2, TmpReg).addReg(CurPC)
+    
+    // Move value at base + distance into return reg
+    copyGlobalBaseToRegister(MBB, IP, GlobalBase);
+    BuildMI(*MBB, IP, PPC32::LOADHiAddr, 2, TmpReg).addReg(GlobalBase)
        .addGlobalAddress(GV);
      BuildMI(*MBB, IP, Opcode, 2, R).addReg(TmpReg).addGlobalAddress(GV);
    
@@ -740,7 +769,7 @@ void ISel::LoadArgumentsToVirtualRegs(Function &Fn) {
    // the start of the first vararg value... this is used to expand
    // llvm.va_start.
    if (Fn.getFunctionType()->isVarArg())
-    VarArgsFrameIndex = MFI->CreateFixedObject(1, ArgOffset);
+    VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset);
  }
  
  
@@ -861,6 +890,32 @@ static SetCondInst *canFoldSetCCIntoBranchOrSelect(Value *V) {
    return 0;
  }
  
+
+// canFoldGEPIntoLoadOrStore - Return the GEP instruction if we can fold it into
+// the load or store instruction that is the only user of the GEP.
+//
+static GetElementPtrInst *canFoldGEPIntoLoadOrStore(Value *V) {
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(V))
+    if (GEPI->hasOneUse()) {
+      Instruction *User = cast<Instruction>(GEPI->use_back());
+      if (isa<StoreInst>(User) &&
+          GEPI->getParent() == User->getParent() &&
+          User->getOperand(0) != GEPI &&
+          User->getOperand(1) == GEPI) {
+        ++GEPFolds;
+        return GEPI;
+      }
+      if (isa<LoadInst>(User) &&
+          GEPI->getParent() == User->getParent() &&
+          User->getOperand(0) == GEPI) {
+        ++GEPFolds;
+        return GEPI;
+      }
+    }
+  return 0;
+}
+
+
  // Return a fixed numbering for setcc instructions which does not depend on the
  // order of the opcodes.
  //
@@ -917,6 +972,26 @@ unsigned ISel::EmitComparison(unsigned OpNum, Value *Op0, Value *Op1,
    unsigned Class = getClassB(CompTy);
    unsigned Op0r = getReg(Op0, MBB, IP);
  
+  // Before we do a comparison, we have to make sure that we're truncating our
+  // registers appropriately.
+  if (Class == cByte) {
+    unsigned TmpReg = makeAnotherReg(CompTy);
+    if (CompTy->isSigned())
+      BuildMI(*MBB, IP, PPC32::EXTSB, 1, TmpReg).addReg(Op0r);
+    else
+      BuildMI(*MBB, IP, PPC32::RLWINM, 4, TmpReg).addReg(Op0r).addImm(0)
+        .addImm(24).addImm(31);
+    Op0r = TmpReg;
+  } else if (Class == cShort) {
+    unsigned TmpReg = makeAnotherReg(CompTy);
+    if (CompTy->isSigned())
+      BuildMI(*MBB, IP, PPC32::EXTSH, 1, TmpReg).addReg(Op0r);
+    else
+      BuildMI(*MBB, IP, PPC32::RLWINM, 4, TmpReg).addReg(Op0r).addImm(0)
+        .addImm(16).addImm(31);
+    Op0r = TmpReg;
+  }
+  
    // Use crand for lt, gt and crandc for le, ge
    unsigned CROpcode = (OpNum == 2 || OpNum == 4) ? PPC32::CRAND : PPC32::CRANDC;
    // ? cr1[lt] : cr1[gt]
@@ -1387,7 +1462,8 @@ void ISel::doCall(const ValueRecord &Ret, MachineInstr *CallMI,
            BuildMI(BB, PPC32::OR, 2, GPR[GPR_idx]).addReg(ArgReg)
              .addReg(ArgReg);
            CallMI->addRegOperand(GPR[GPR_idx], MachineOperand::Use);
-        } else {
+        }
+        if (GPR_remaining <= 0 || isVarArg) {
            BuildMI(BB, PPC32::STW, 3).addReg(ArgReg).addSImm(ArgOffset)
              .addReg(PPC32::R1);
          }
@@ -1400,7 +1476,8 @@ void ISel::doCall(const ValueRecord &Ret, MachineInstr *CallMI,
            BuildMI(BB, PPC32::OR, 2, GPR[GPR_idx]).addReg(ArgReg)
              .addReg(ArgReg);
            CallMI->addRegOperand(GPR[GPR_idx], MachineOperand::Use);
-        } else {
+        }
+        if (GPR_remaining <= 0 || isVarArg) {
            BuildMI(BB, PPC32::STW, 3).addReg(ArgReg).addSImm(ArgOffset)
              .addReg(PPC32::R1);
          }
@@ -1417,7 +1494,8 @@ void ISel::doCall(const ValueRecord &Ret, MachineInstr *CallMI,
              .addReg(ArgReg+1);
            CallMI->addRegOperand(GPR[GPR_idx], MachineOperand::Use);
            CallMI->addRegOperand(GPR[GPR_idx+1], MachineOperand::Use);
-        } else {
+        }
+        if (GPR_remaining <= 1 || isVarArg) {
            BuildMI(BB, PPC32::STW, 3).addReg(ArgReg).addSImm(ArgOffset)
              .addReg(PPC32::R1);
            BuildMI(BB, PPC32::STW, 3).addReg(ArgReg+1).addSImm(ArgOffset+4)
@@ -1599,8 +1677,10 @@ void ISel::LowerUnknownIntrinsicFunctionCalls(Function &F) {
            case Intrinsic::vaend:
            case Intrinsic::returnaddress:
            case Intrinsic::frameaddress:
-            // FIXME: should lower this ourselves
+            // FIXME: should lower these ourselves
              // case Intrinsic::isunordered:
+            // case Intrinsic::memcpy: -> doCall().  system memcpy almost
+            // guaranteed to be faster than anything we generate ourselves
              // We directly implement these intrinsics
              break;
            case Intrinsic::readio: {
@@ -1674,7 +1754,7 @@ void ISel::visitIntrinsicCall(Intrinsic::ID ID, CallInst &CI) {
        BuildMI(BB, PPC32::LI, 1, TmpReg1).addSImm(0);
      }
      return;
-
+    
  #if 0
      // This may be useful for supporting isunordered
    case Intrinsic::isnan:
@@ -2289,7 +2369,7 @@ void ISel::emitShiftOperation(MachineBasicBlock *MBB,
          if (isSigned) {
            // FIXME: Unimplemented
            // Page C-3 of the PowerPC 32bit Programming Environments Manual
-          std::cerr << "ERROR: Unimplemented: signed right shift\n";
+          std::cerr << "ERROR: Unimplemented: signed right shift of long\n";
            abort();
          } else {
            BuildMI(*MBB, IP, PPC32::SUBFIC, 2, TmpReg1).addReg(ShiftAmountReg)
@@ -2344,68 +2424,168 @@ void ISel::emitShiftOperation(MachineBasicBlock *MBB,
  }
  
  
-/// visitLoadInst - Implement LLVM load instructions
+/// visitLoadInst - Implement LLVM load instructions.  Pretty straightforward
+/// mapping of LLVM classes to PPC load instructions, with the exception of
+/// signed byte loads, which need a sign extension following them.
  ///
  void ISel::visitLoadInst(LoadInst &I) {
-  static const unsigned Opcodes[] = { 
-    PPC32::LBZ, PPC32::LHZ, PPC32::LWZ, PPC32::LFS 
+  // Immediate opcodes, for reg+imm addressing
+  static const unsigned ImmOpcodes[] = { 
+    PPC32::LBZ, PPC32::LHZ, PPC32::LWZ, 
+    PPC32::LFS, PPC32::LFD, PPC32::LWZ
+  };
+  // Indexed opcodes, for reg+reg addressing
+  static const unsigned IdxOpcodes[] = {
+    PPC32::LBZX, PPC32::LHZX, PPC32::LWZX,
+    PPC32::LFSX, PPC32::LFDX, PPC32::LWZX
    };
  
-  unsigned Class = getClassB(I.getType());
-  unsigned Opcode = Opcodes[Class];
-  if (I.getType() == Type::DoubleTy) Opcode = PPC32::LFD;
-  if (Class == cShort && I.getType()->isSigned()) Opcode = PPC32::LHA;
-  unsigned DestReg = getReg(I);
+  unsigned Class     = getClassB(I.getType());
+  unsigned ImmOpcode = ImmOpcodes[Class];
+  unsigned IdxOpcode = IdxOpcodes[Class];
+  unsigned DestReg   = getReg(I);
+  Value *SourceAddr  = I.getOperand(0);
+  
+  if (Class == cShort && I.getType()->isSigned()) ImmOpcode = PPC32::LHA;
+  if (Class == cShort && I.getType()->isSigned()) IdxOpcode = PPC32::LHAX;
  
-  if (AllocaInst *AI = dyn_castFixedAlloca(I.getOperand(0))) {
+  if (AllocaInst *AI = dyn_castFixedAlloca(SourceAddr)) {
      unsigned FI = getFixedSizedAllocaFI(AI);
      if (Class == cLong) {
-      addFrameReference(BuildMI(BB, PPC32::LWZ, 2, DestReg), FI);
-      addFrameReference(BuildMI(BB, PPC32::LWZ, 2, DestReg+1), FI, 4);
+      addFrameReference(BuildMI(BB, ImmOpcode, 2, DestReg), FI);
+      addFrameReference(BuildMI(BB, ImmOpcode, 2, DestReg+1), FI, 4);
      } else if (Class == cByte && I.getType()->isSigned()) {
        unsigned TmpReg = makeAnotherReg(I.getType());
-      addFrameReference(BuildMI(BB, Opcode, 2, TmpReg), FI);
+      addFrameReference(BuildMI(BB, ImmOpcode, 2, TmpReg), FI);
        BuildMI(BB, PPC32::EXTSB, 1, DestReg).addReg(TmpReg);
      } else {
-      addFrameReference(BuildMI(BB, Opcode, 2, DestReg), FI);
+      addFrameReference(BuildMI(BB, ImmOpcode, 2, DestReg), FI);
+    }
+    return;
+  }
+  
+  // If this load is the only use of the GEP instruction that is its address,
+  // then we can fold the GEP directly into the load instruction.
+  // emitGEPOperation with a second to last arg of 'true' will place the
+  // base register for the GEP into baseReg, and the constant offset from that
+  // into offset.  If the offset fits in 16 bits, we can emit a reg+imm store
+  // otherwise, we copy the offset into another reg, and use reg+reg addressing.
+  if (GetElementPtrInst *GEPI = canFoldGEPIntoLoadOrStore(SourceAddr)) {
+    unsigned baseReg = getReg(GEPI);
+    ConstantSInt *offset;
+    
+    emitGEPOperation(BB, BB->end(), GEPI->getOperand(0), GEPI->op_begin()+1, 
+                     GEPI->op_end(), baseReg, true, &offset);
+
+    if (Class != cLong && canUseAsImmediateForOpcode(offset, 0)) {
+      if (Class == cByte && I.getType()->isSigned()) {
+        unsigned TmpReg = makeAnotherReg(I.getType());
+        BuildMI(BB, ImmOpcode, 2, TmpReg).addSImm(offset->getValue())
+          .addReg(baseReg);
+        BuildMI(BB, PPC32::EXTSB, 1, DestReg).addReg(TmpReg);
+      } else {
+        BuildMI(BB, ImmOpcode, 2, DestReg).addSImm(offset->getValue())
+          .addReg(baseReg);
+      }
+      return;
      }
-  } else {
-    unsigned SrcAddrReg = getReg(I.getOperand(0));
      
+    unsigned indexReg = getReg(offset);
+
      if (Class == cLong) {
-      BuildMI(BB, PPC32::LWZ, 2, DestReg).addSImm(0).addReg(SrcAddrReg);
-      BuildMI(BB, PPC32::LWZ, 2, DestReg+1).addSImm(4).addReg(SrcAddrReg);
+      unsigned indexPlus4 = makeAnotherReg(Type::IntTy);
+      BuildMI(BB, PPC32::ADDI, 2, indexPlus4).addReg(indexReg).addSImm(4);
+      BuildMI(BB, IdxOpcode, 2, DestReg).addReg(indexReg).addReg(baseReg);
+      BuildMI(BB, IdxOpcode, 2, DestReg+1).addReg(indexPlus4).addReg(baseReg);
      } else if (Class == cByte && I.getType()->isSigned()) {
        unsigned TmpReg = makeAnotherReg(I.getType());
-      BuildMI(BB, Opcode, 2, TmpReg).addSImm(0).addReg(SrcAddrReg);
+      BuildMI(BB, IdxOpcode, 2, DestReg).addReg(indexReg).addReg(baseReg);
        BuildMI(BB, PPC32::EXTSB, 1, DestReg).addReg(TmpReg);
      } else {
-      BuildMI(BB, Opcode, 2, DestReg).addSImm(0).addReg(SrcAddrReg);
+      BuildMI(BB, IdxOpcode, 2, DestReg).addReg(indexReg).addReg(baseReg);
      }
+    return;
+  }
+  
+  // The fallback case, where the load was from a source that could not be
+  // folded into the load instruction. 
+  unsigned SrcAddrReg = getReg(SourceAddr);
+    
+  if (Class == cLong) {
+    BuildMI(BB, ImmOpcode, 2, DestReg).addSImm(0).addReg(SrcAddrReg);
+    BuildMI(BB, ImmOpcode, 2, DestReg+1).addSImm(4).addReg(SrcAddrReg);
+  } else if (Class == cByte && I.getType()->isSigned()) {
+    unsigned TmpReg = makeAnotherReg(I.getType());
+    BuildMI(BB, ImmOpcode, 2, TmpReg).addSImm(0).addReg(SrcAddrReg);
+    BuildMI(BB, PPC32::EXTSB, 1, DestReg).addReg(TmpReg);
+  } else {
+    BuildMI(BB, ImmOpcode, 2, DestReg).addSImm(0).addReg(SrcAddrReg);
    }
  }
  
  /// visitStoreInst - Implement LLVM store instructions
  ///
  void ISel::visitStoreInst(StoreInst &I) {
-  unsigned ValReg      = getReg(I.getOperand(0));
-  unsigned AddressReg  = getReg(I.getOperand(1));
- 
-  const Type *ValTy = I.getOperand(0)->getType();
-  unsigned Class = getClassB(ValTy);
+  // Immediate opcodes, for reg+imm addressing
+  static const unsigned ImmOpcodes[] = {
+    PPC32::STB, PPC32::STH, PPC32::STW, 
+    PPC32::STFS, PPC32::STFD, PPC32::STW
+  };
+  // Indexed opcodes, for reg+reg addressing
+  static const unsigned IdxOpcodes[] = {
+    PPC32::STBX, PPC32::STHX, PPC32::STWX, 
+    PPC32::STFSX, PPC32::STDX, PPC32::STWX
+  };
+  
+  Value *SourceAddr  = I.getOperand(1);
+  const Type *ValTy  = I.getOperand(0)->getType();
+  unsigned Class     = getClassB(ValTy);
+  unsigned ImmOpcode = ImmOpcodes[Class];
+  unsigned IdxOpcode = IdxOpcodes[Class];
+  unsigned ValReg    = getReg(I.getOperand(0));
+
+  // If this store is the only use of the GEP instruction that is its address,
+  // then we can fold the GEP directly into the store instruction.
+  // emitGEPOperation with a second to last arg of 'true' will place the
+  // base register for the GEP into baseReg, and the constant offset from that
+  // into offset.  If the offset fits in 16 bits, we can emit a reg+imm store
+  // otherwise, we copy the offset into another reg, and use reg+reg addressing.
+  if (GetElementPtrInst *GEPI = canFoldGEPIntoLoadOrStore(SourceAddr)) {
+    unsigned baseReg = getReg(GEPI);
+    ConstantSInt *offset;
+    
+    emitGEPOperation(BB, BB->end(), GEPI->getOperand(0), GEPI->op_begin()+1, 
+                     GEPI->op_end(), baseReg, true, &offset);
+
+    if (Class != cLong && canUseAsImmediateForOpcode(offset, 0)) {
+      BuildMI(BB, ImmOpcode, 3).addReg(ValReg).addSImm(offset->getValue())
+        .addReg(baseReg);
+      return;
+    }
+    
+    unsigned indexReg = getReg(offset);
  
+    if (Class == cLong) {
+      unsigned indexPlus4 = makeAnotherReg(Type::IntTy);
+      BuildMI(BB, PPC32::ADDI, 2, indexPlus4).addReg(indexReg).addSImm(4);
+      BuildMI(BB, IdxOpcode, 3).addReg(ValReg).addReg(indexReg).addReg(baseReg);
+      BuildMI(BB, IdxOpcode, 3).addReg(ValReg+1).addReg(indexPlus4)
+        .addReg(baseReg);
+      return;
+    }
+    BuildMI(BB, IdxOpcode, 3).addReg(ValReg).addReg(indexReg).addReg(baseReg);
+    return;
+  }
+  
+  // If the store address wasn't the only use of a GEP, we fall back to the
+  // standard path: store the ValReg at the value in AddressReg.
+  unsigned AddressReg  = getReg(I.getOperand(1));
    if (Class == cLong) {
-    BuildMI(BB, PPC32::STW, 3).addReg(ValReg).addSImm(0).addReg(AddressReg);
-    BuildMI(BB, PPC32::STW, 3).addReg(ValReg+1).addSImm(4).addReg(AddressReg);
+    BuildMI(BB, ImmOpcode, 3).addReg(ValReg).addSImm(0).addReg(AddressReg);
+    BuildMI(BB, ImmOpcode, 3).addReg(ValReg+1).addSImm(4).addReg(AddressReg);
      return;
    }
-
-  static const unsigned Opcodes[] = {
-    PPC32::STB, PPC32::STH, PPC32::STW, PPC32::STFS
-  };
-  unsigned Opcode = Opcodes[Class];
-  if (ValTy == Type::DoubleTy) Opcode = PPC32::STFD;
-  BuildMI(BB, Opcode, 3).addReg(ValReg).addSImm(0).addReg(AddressReg);
+  BuildMI(BB, ImmOpcode, 3).addReg(ValReg).addSImm(0).addReg(AddressReg);
  }
  
  
@@ -2417,10 +2597,6 @@ void ISel::visitCastInst(CastInst &CI) {
  
    unsigned SrcClass = getClassB(Op->getType());
    unsigned DestClass = getClassB(CI.getType());
-  // Noop casts are not emitted: getReg will return the source operand as the
-  // register to use for any uses of the noop cast.
-  if (DestClass == SrcClass)
-    return;
  
    // If this is a cast from a 32-bit integer to a Long type, and the only uses
    // of the case are GEP instructions, then the cast does not need to be
@@ -2484,24 +2660,6 @@ void ISel::emitCastOperation(MachineBasicBlock *MBB,
      return;
    }
  
-  // Implement casts between values of the same type class (as determined by
-  // getClass) by using a register-to-register move.
-  if (SrcClass == DestClass) {
-    if (SrcClass <= cInt) {
-      BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
-    } else if (SrcClass == cFP32 || SrcClass == cFP64) {
-      BuildMI(*MBB, IP, PPC32::FMR, 1, DestReg).addReg(SrcReg);
-    } else if (SrcClass == cLong) {
-      BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
-      BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg+1)
-        .addReg(SrcReg+1);
-    } else {
-      assert(0 && "Cannot handle this type of cast instruction!");
-      abort();
-    }
-    return;
-  }
-  
    // Handle cast of Float -> Double
    if (SrcClass == cFP32 && DestClass == cFP64) {
      BuildMI(*MBB, IP, PPC32::FMR, 1, DestReg).addReg(SrcReg);
@@ -2514,52 +2672,6 @@ void ISel::emitCastOperation(MachineBasicBlock *MBB,
      return;
    }
    
-  // Handle cast of SMALLER int to LARGER int using a move with sign extension
-  // or zero extension, depending on whether the source type was signed.
-  if (SrcClass <= cInt && (DestClass <= cInt || DestClass == cLong) &&
-      SrcClass < DestClass) {
-    bool isLong = DestClass == cLong;
-    if (isLong) {
-      DestClass = cInt;
-      ++DestReg;
-    }
-    
-    bool isUnsigned = DestTy->isUnsigned() || DestTy == Type::BoolTy;
-    BuildMI(*BB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
-
-    if (isLong) {  // Handle upper 32 bits as appropriate...
-      --DestReg;
-      if (isUnsigned)     // Zero out top bits...
-        BuildMI(*BB, IP, PPC32::LI, 1, DestReg).addSImm(0);
-      else                // Sign extend bottom half...
-        BuildMI(*BB, IP, PPC32::SRAWI, 2, DestReg).addReg(SrcReg).addImm(31);
-    }
-    return;
-  }
-
-  // Special case long -> int ...
-  if (SrcClass == cLong && DestClass == cInt) {
-    BuildMI(*BB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg+1).addReg(SrcReg+1);
-    return;
-  }
-  
-  // Handle cast of LARGER int to SMALLER int with a clear or sign extend
-  if ((SrcClass <= cInt || SrcClass == cLong) && DestClass <= cInt && 
-      SrcClass > DestClass) {
-    bool isUnsigned = DestTy->isUnsigned() || DestTy == Type::BoolTy;
-    unsigned source = (SrcClass == cLong) ? SrcReg+1 : SrcReg;
-    
-    if (isUnsigned) {
-      unsigned shift = (DestClass == cByte) ? 24 : 16;
-      BuildMI(*BB, IP, PPC32::RLWINM, 4, DestReg).addReg(source).addZImm(0)
-        .addImm(shift).addImm(31);
-    } else {
-      BuildMI(*BB, IP, (DestClass == cByte) ? PPC32::EXTSB : PPC32::EXTSH, 1, 
-              DestReg).addReg(source);
-    }
-    return;
-  }
-
    // Handle casts from integer to floating point now...
    if (DestClass == cFP32 || DestClass == cFP64) {
  
@@ -2624,7 +2736,7 @@ void ISel::emitCastOperation(MachineBasicBlock *MBB,
        addFrameReference(BuildMI(*BB, IP, PPC32::LFD, 2, ConstF), 
                          ConstantFrameIndex);
        addFrameReference(BuildMI(*BB, IP, PPC32::LFD, 2, TempF), ValueFrameIdx);
-      BuildMI(*BB, IP, PPC32::FSUB, 2, DestReg).addReg(TempF ).addReg(ConstF);
+      BuildMI(*BB, IP, PPC32::FSUB, 2, DestReg).addReg(TempF).addReg(ConstF);
      }
      return;
    }
@@ -2647,23 +2759,26 @@ void ISel::emitCastOperation(MachineBasicBlock *MBB,
        F->getFrameInfo()->CreateStackObject(SrcTy, TM.getTargetData());
  
      if (DestTy->isSigned()) {
-      unsigned LoadOp = (DestClass == cShort) ? PPC32::LHA : PPC32::LWZ;
        unsigned TempReg = makeAnotherReg(Type::DoubleTy);
        
        // Convert to integer in the FP reg and store it to a stack slot
        BuildMI(*BB, IP, PPC32::FCTIWZ, 1, TempReg).addReg(SrcReg);
        addFrameReference(BuildMI(*BB, IP, PPC32::STFD, 3)
                            .addReg(TempReg), ValueFrameIdx);
-      
-      // There is no load signed byte opcode, so we must emit a sign extend
+
+      // There is no load signed byte opcode, so we must emit a sign extend for
+      // that particular size.  Make sure to source the new integer from the 
+      // correct offset.
        if (DestClass == cByte) {
          unsigned TempReg2 = makeAnotherReg(DestTy);
-        addFrameReference(BuildMI(*BB, IP, LoadOp, 2, TempReg2), 
-                          ValueFrameIdx, 4);
+        addFrameReference(BuildMI(*BB, IP, PPC32::LBZ, 2, TempReg2), 
+                          ValueFrameIdx, 7);
          BuildMI(*MBB, IP, PPC32::EXTSB, DestReg).addReg(TempReg2);
        } else {
+        int offset = (DestClass == cShort) ? 6 : 4;
+        unsigned LoadOp = (DestClass == cShort) ? PPC32::LHA : PPC32::LWZ;
          addFrameReference(BuildMI(*BB, IP, LoadOp, 2, DestReg), 
-                          ValueFrameIdx, 4);
+                          ValueFrameIdx, offset);
        }
      } else {
        unsigned Zero = getReg(ConstantFP::get(Type::DoubleTy, 0.0f));
@@ -2710,29 +2825,228 @@ void ISel::emitCastOperation(MachineBasicBlock *MBB,
        BuildMI(*BB, IP, PPC32::FCTIWZ, 1, ConvReg).addReg(TmpReg2);
        addFrameReference(BuildMI(*BB, IP, PPC32::STFD, 3).addReg(ConvReg),
                          FrameIdx);
-      addFrameReference(BuildMI(*BB, IP, PPC32::LWZ, 2, IntTmp),
-                        FrameIdx, 4);
-      BuildMI(*BB, IP, PPC32::BLT, 2).addReg(PPC32::CR0).addMBB(PhiMBB);
-      BuildMI(*BB, IP, PPC32::B, 1).addMBB(XorMBB);
-
-      // XorMBB:
-      //   add 2**31 if input was >= 2**31
-      BB = XorMBB;
-      BuildMI(BB, PPC32::XORIS, 2, XorReg).addReg(IntTmp).addImm(0x8000);
-      BuildMI(BB, PPC32::B, 1).addMBB(PhiMBB);
-      XorMBB->addSuccessor(PhiMBB);
-
-      // PhiMBB:
-      //   DestReg = phi [ IntTmp, OldMBB ], [ XorReg, XorMBB ]
-      BB = PhiMBB;
-      BuildMI(BB, PPC32::PHI, 2, DestReg).addReg(IntTmp).addMBB(OldMBB)
-        .addReg(XorReg).addMBB(XorMBB);
+      if (DestClass == cByte) {
+        addFrameReference(BuildMI(*BB, IP, PPC32::LBZ, 2, DestReg),
+                          FrameIdx, 7);
+      } else if (DestClass == cShort) {
+        addFrameReference(BuildMI(*BB, IP, PPC32::LHZ, 2, DestReg),
+                          FrameIdx, 6);
+      } if (DestClass == cInt) {
+        addFrameReference(BuildMI(*BB, IP, PPC32::LWZ, 2, IntTmp),
+                          FrameIdx, 4);
+        BuildMI(*BB, IP, PPC32::BLT, 2).addReg(PPC32::CR0).addMBB(PhiMBB);
+        BuildMI(*BB, IP, PPC32::B, 1).addMBB(XorMBB);
+
+        // XorMBB:
+        //   add 2**31 if input was >= 2**31
+        BB = XorMBB;
+        BuildMI(BB, PPC32::XORIS, 2, XorReg).addReg(IntTmp).addImm(0x8000);
+        XorMBB->addSuccessor(PhiMBB);
+
+        // PhiMBB:
+        //   DestReg = phi [ IntTmp, OldMBB ], [ XorReg, XorMBB ]
+        BB = PhiMBB;
+        BuildMI(BB, PPC32::PHI, 2, DestReg).addReg(IntTmp).addMBB(OldMBB)
+          .addReg(XorReg).addMBB(XorMBB);
+      }
+    }
+    return;
+  }
+
+  // Check our invariants
+  assert((SrcClass <= cInt || SrcClass == cLong) && 
+         "Unhandled source class for cast operation!");
+  assert((DestClass <= cInt || DestClass == cLong) && 
+         "Unhandled destination class for cast operation!");
+
+  bool sourceUnsigned = SrcTy->isUnsigned() || SrcTy == Type::BoolTy;
+  bool destUnsigned = DestTy->isUnsigned();
+
+  // Unsigned -> Unsigned, clear if larger, 
+  if (sourceUnsigned && destUnsigned) {
+    // handle long dest class now to keep switch clean
+    if (DestClass == cLong) {
+      if (SrcClass == cLong) {
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg+1)
+          .addReg(SrcReg+1);
+      } else {
+        BuildMI(*MBB, IP, PPC32::LI, 1, DestReg).addSImm(0);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg)
+          .addReg(SrcReg);
+      }
+      return;
+    }
+
+    // handle u{ byte, short, int } x u{ byte, short, int }
+    unsigned clearBits = (SrcClass == cByte || DestClass == cByte) ? 24 : 16;
+    switch (SrcClass) {
+    case cByte:
+    case cShort:
+      if (SrcClass == DestClass)
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg)
+          .addImm(0).addImm(clearBits).addImm(31);
+      break;
+    case cLong:
+      ++SrcReg;
+      // Fall through
+    case cInt:
+      if (DestClass == cInt)
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg)
+          .addImm(0).addImm(clearBits).addImm(31);
+      break;
+    }
+    return;
+  }
+
+  // Signed -> Signed
+  if (!sourceUnsigned && !destUnsigned) {
+    // handle long dest class now to keep switch clean
+    if (DestClass == cLong) {
+      if (SrcClass == cLong) {
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg+1)
+          .addReg(SrcReg+1);
+      } else {
+        BuildMI(*MBB, IP, PPC32::SRAWI, 2, DestReg).addReg(SrcReg).addImm(31);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg)
+          .addReg(SrcReg);
+      }
+      return;
+    }
+
+    // handle { byte, short, int } x { byte, short, int }
+    switch (SrcClass) {
+    case cByte:
+      if (DestClass == cByte)
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      break;
+    case cShort:
+      if (DestClass == cByte)
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      else if (DestClass == cShort)
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::EXTSH, 1, DestReg).addReg(SrcReg);
+      break;
+    case cLong:
+      ++SrcReg;
+      // Fall through
+    case cInt:
+      if (DestClass == cByte)
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      else if (DestClass == cShort)
+        BuildMI(*MBB, IP, PPC32::EXTSH, 1, DestReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      break;
+    }
+    return;
+  }
+
+  // Unsigned -> Signed
+  if (sourceUnsigned && !destUnsigned) {
+    // handle long dest class now to keep switch clean
+    if (DestClass == cLong) {
+      if (SrcClass == cLong) {
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg+1).
+          addReg(SrcReg+1);
+      } else {
+        BuildMI(*MBB, IP, PPC32::LI, 1, DestReg).addSImm(0);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg)
+          .addReg(SrcReg);
+      }
+      return;
+    }
+
+    // handle u{ byte, short, int } -> { byte, short, int }
+    switch (SrcClass) {
+    case cByte:
+      if (DestClass == cByte)
+        // uByte 255 -> signed byte == -1
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      else
+        // uByte 255 -> signed short/int == 255
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg).addImm(0)
+          .addImm(24).addImm(31);
+      break;
+    case cShort:
+      if (DestClass == cByte)
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      else if (DestClass == cShort)
+        BuildMI(*MBB, IP, PPC32::EXTSH, 1, DestReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg).addImm(0)
+          .addImm(16).addImm(31);
+      break;
+    case cLong:
+      ++SrcReg;
+      // Fall through
+    case cInt:
+      if (DestClass == cByte)
+        BuildMI(*MBB, IP, PPC32::EXTSB, 1, DestReg).addReg(SrcReg);
+      else if (DestClass == cShort)
+        BuildMI(*MBB, IP, PPC32::EXTSH, 1, DestReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      break;
+    }
+    return;
+  }
+
+  // Signed -> Unsigned
+  if (!sourceUnsigned && destUnsigned) {
+    // handle long dest class now to keep switch clean
+    if (DestClass == cLong) {
+      if (SrcClass == cLong) {
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg+1)
+          .addReg(SrcReg+1);
+      } else {
+        BuildMI(*MBB, IP, PPC32::SRAWI, 2, DestReg).addReg(SrcReg).addImm(31);
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg+1).addReg(SrcReg)
+          .addReg(SrcReg);
+      }
+      return;
+    }
+
+    // handle { byte, short, int } -> u{ byte, short, int }
+    unsigned clearBits = (DestClass == cByte) ? 24 : 16;
+    switch (SrcClass) {
+    case cByte:
+    case cShort:
+      if (DestClass == cByte || DestClass == cShort)
+        // sbyte -1 -> ubyte 0x000000FF
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg)
+          .addImm(0).addImm(clearBits).addImm(31);
+      else
+        // sbyte -1 -> ubyte 0xFFFFFFFF
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      break;
+    case cLong:
+      ++SrcReg;
+      // Fall through
+    case cInt:
+      if (DestClass == cInt)
+        BuildMI(*MBB, IP, PPC32::OR, 2, DestReg).addReg(SrcReg).addReg(SrcReg);
+      else
+        BuildMI(*MBB, IP, PPC32::RLWINM, 4, DestReg).addReg(SrcReg)
+          .addImm(0).addImm(clearBits).addImm(31);
+      break;
      }
      return;
    }
  
    // Anything we haven't handled already, we can't (yet) handle at all.
-  assert(0 && "Unhandled cast instruction!");
+  std::cerr << "Unhandled cast from " << SrcTy->getDescription()
+            << "to " << DestTy->getDescription() << '\n';
    abort();
  }
  
@@ -2783,6 +3097,9 @@ void ISel::visitVAArgInst(VAArgInst &I) {
      BuildMI(BB, PPC32::LWZ, 2, DestReg).addSImm(0).addReg(VAList);
      BuildMI(BB, PPC32::LWZ, 2, DestReg+1).addSImm(4).addReg(VAList);
      break;
+  case Type::FloatTyID:
+    BuildMI(BB, PPC32::LFS, 2, DestReg).addSImm(0).addReg(VAList);
+    break;
    case Type::DoubleTyID:
      BuildMI(BB, PPC32::LFD, 2, DestReg).addSImm(0).addReg(VAList);
      break;
@@ -2792,9 +3109,12 @@ void ISel::visitVAArgInst(VAArgInst &I) {
  /// visitGetElementPtrInst - instruction-select GEP instructions
  ///
  void ISel::visitGetElementPtrInst(GetElementPtrInst &I) {
+  if (canFoldGEPIntoLoadOrStore(&I))
+    return;
+
    unsigned outputReg = getReg(I);
    emitGEPOperation(BB, BB->end(), I.getOperand(0), I.op_begin()+1, I.op_end(), 
-                   outputReg);
+                   outputReg, false, 0);
  }
  
  /// emitGEPOperation - Common code shared between visitGetElementPtrInst and
@@ -2803,16 +3123,16 @@ void ISel::visitGetElementPtrInst(GetElementPtrInst &I) {
  void ISel::emitGEPOperation(MachineBasicBlock *MBB,
                              MachineBasicBlock::iterator IP,
                              Value *Src, User::op_iterator IdxBegin,
-                            User::op_iterator IdxEnd, unsigned TargetReg) {
+                            User::op_iterator IdxEnd, unsigned TargetReg,
+                            bool GEPIsFolded, ConstantSInt **RemainderPtr) {
    const TargetData &TD = TM.getTargetData();
    const Type *Ty = Src->getType();
    unsigned basePtrReg = getReg(Src, MBB, IP);
    int64_t constValue = 0;
-  bool anyCombined = false;
    
    // Record the operations to emit the GEP in a vector so that we can emit them
    // after having analyzed the entire instruction.
-  std::vector<CollapsedGepOp*> ops;
+  std::vector<CollapsedGepOp> ops;
    
    // GEPs have zero or more indices; we must perform a struct access
    // or array access for each one.
@@ -2829,7 +3149,6 @@ void ISel::emitGEPOperation(MachineBasicBlock *MBB,
        unsigned fieldIndex = cast<ConstantUInt>(idx)->getValue();
        unsigned memberOffset =
          TD.getStructLayout(StTy)->MemberOffsets[fieldIndex];
-      if (constValue != 0) anyCombined = true;
  
        // StructType member offsets are always constant values.  Add it to the
        // running total.
@@ -2854,8 +3173,6 @@ void ISel::emitGEPOperation(MachineBasicBlock *MBB,
        unsigned elementSize = TD.getTypeSize(Ty);
        
        if (ConstantInt *C = dyn_cast<ConstantInt>(idx)) {
-        if (constValue != 0) anyCombined = true;
-
          if (ConstantSInt *CS = dyn_cast<ConstantSInt>(C))
            constValue += CS->getValue() * elementSize;
          else if (ConstantUInt *CU = dyn_cast<ConstantUInt>(C))
@@ -2864,48 +3181,40 @@ void ISel::emitGEPOperation(MachineBasicBlock *MBB,
            assert(0 && "Invalid ConstantInt GEP index type!");
        } else {
          // Push current gep state to this point as an add
-        CollapsedGepOp *addition = 
-          new CollapsedGepOp(false, 0, ConstantSInt::get(Type::IntTy,
-                constValue));
-        ops.push_back(addition);
+        ops.push_back(CollapsedGepOp(false, 0, 
+          ConstantSInt::get(Type::IntTy,constValue)));
          
          // Push multiply gep op and reset constant value
-        CollapsedGepOp *multiply = 
-          new CollapsedGepOp(true, idx, ConstantSInt::get(Type::IntTy, 
-                elementSize));
-        ops.push_back(multiply);
+        ops.push_back(CollapsedGepOp(true, idx, 
+          ConstantSInt::get(Type::IntTy, elementSize)));
          
          constValue = 0;
        }
      }
    }
-  // Do some statistical accounting
-  if (ops.empty()) ++GEPConsts; 
-  if (anyCombined) ++GEPSplits;
-    
    // Emit instructions for all the collapsed ops
-  for(std::vector<CollapsedGepOp *>::iterator cgo_i = ops.begin(),
+  for(std::vector<CollapsedGepOp>::iterator cgo_i = ops.begin(),
        cgo_e = ops.end(); cgo_i != cgo_e; ++cgo_i) {
-    CollapsedGepOp *cgo = *cgo_i;
+    CollapsedGepOp& cgo = *cgo_i;
      unsigned nextBasePtrReg = makeAnotherReg (Type::IntTy);
  
-    if (cgo->isMul) {
+    if (cgo.isMul) {
        // We know the elementSize is a constant, so we can emit a constant mul
        // and then add it to the current base reg
        unsigned TmpReg = makeAnotherReg(Type::IntTy);
-      doMultiplyConst(MBB, IP, TmpReg, cgo->index, cgo->size);
+      doMultiplyConst(MBB, IP, TmpReg, cgo.index, cgo.size);
        BuildMI(*MBB, IP, PPC32::ADD, 2, nextBasePtrReg).addReg(basePtrReg)
          .addReg(TmpReg);
      } else {
        // Try and generate an immediate addition if possible
-      if (cgo->size->isNullValue()) {
+      if (cgo.size->isNullValue()) {
          BuildMI(*MBB, IP, PPC32::OR, 2, nextBasePtrReg).addReg(basePtrReg)
            .addReg(basePtrReg);
-      } else if (canUseAsImmediateForOpcode(cgo->size, 0)) {
+      } else if (canUseAsImmediateForOpcode(cgo.size, 0)) {
          BuildMI(*MBB, IP, PPC32::ADDI, 2, nextBasePtrReg).addReg(basePtrReg)
-          .addSImm(cgo->size->getValue());
+          .addSImm(cgo.size->getValue());
        } else {
-        unsigned Op1r = getReg(cgo->size, MBB, IP);
+        unsigned Op1r = getReg(cgo.size, MBB, IP);
          BuildMI(*MBB, IP, PPC32::ADD, 2, nextBasePtrReg).addReg(basePtrReg)
            .addReg(Op1r);
        }
@@ -2916,6 +3225,15 @@ void ISel::emitGEPOperation(MachineBasicBlock *MBB,
    // Add the current base register plus any accumulated constant value
    ConstantSInt *remainder = ConstantSInt::get(Type::IntTy, constValue);
    
+  // If we are emitting this during a fold, copy the current base register to
+  // the target, and save the current constant offset so the folding load or
+  // store can try and use it as an immediate.
+  if (GEPIsFolded) {
+    BuildMI (BB, PPC32::OR, 2, TargetReg).addReg(basePtrReg).addReg(basePtrReg);
+    *RemainderPtr = remainder;
+    return;
+  }
+  
    // After we have processed all the indices, the result is left in
    // basePtrReg.  Move it to the register where we were expected to
    // put the answer.
author	Misha Brukman <brukman+llvm@gmail.com>
	Mon, 26 Jul 2004 18:13:24 +0000 (18:13 +0000)
committer	Misha Brukman <brukman+llvm@gmail.com>
	Mon, 26 Jul 2004 18:13:24 +0000 (18:13 +0000)
lib/Target/PowerPC/PPC32ISelSimple.cpp		patch \| blob \| history
lib/Target/PowerPC/PowerPCISelSimple.cpp		patch \| blob \| history