* Use the MachineConstantPool for storing constants instead of a hash_set;
[oota-llvm.git] / lib / Target / SparcV9 / SparcV9InstrSelection.cpp
index 58c15edc29f239ac39de865caf7c02781f253b59..b377658b9c8113717365c23ed5e2193d49021e60 100644 (file)
@@ -1,30 +1,36 @@
 //===-- SparcInstrSelection.cpp -------------------------------------------===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
 //
 //  BURS instruction selection for SPARC V9 architecture.      
 //
 //===----------------------------------------------------------------------===//
 
-#include "SparcInternals.h"
 #include "SparcInstrSelectionSupport.h"
+#include "SparcInternals.h"
 #include "SparcRegClassInfo.h"
-#include "llvm/CodeGen/InstrSelectionSupport.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineInstrAnnot.h"
+#include "llvm/Constants.h"
+#include "llvm/ConstantHandling.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
 #include "llvm/CodeGen/InstrForest.h"
 #include "llvm/CodeGen/InstrSelection.h"
+#include "llvm/CodeGen/InstrSelectionSupport.h"
+#include "llvm/CodeGen/MachineCodeForInstruction.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineCodeForInstruction.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/iTerminators.h"
-#include "llvm/iMemory.h"
-#include "llvm/iOther.h"
-#include "llvm/Function.h"
-#include "llvm/Constants.h"
-#include "llvm/ConstantHandling.h"
-#include "llvm/Intrinsics.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrAnnot.h"
 #include "Support/MathExtras.h"
-#include <math.h>
+#include <algorithm>
+#include <cmath>
 
 static inline void Add3OperandInstr(unsigned Opcode, InstructionNode* Node,
                                     std::vector<MachineInstr*>& mvec) {
@@ -34,25 +40,6 @@ static inline void Add3OperandInstr(unsigned Opcode, InstructionNode* Node,
 }
 
 
-
-//---------------------------------------------------------------------------
-// Function: GetMemInstArgs
-// 
-// Purpose:
-//   Get the pointer value and the index vector for a memory operation
-//   (GetElementPtr, Load, or Store).  If all indices of the given memory
-//   operation are constant, fold in constant indices in a chain of
-//   preceding GetElementPtr instructions (if any), and return the
-//   pointer value of the first instruction in the chain.
-//   All folded instructions are marked so no code is generated for them.
-//
-// Return values:
-//   Returns the pointer Value to use.
-//   Returns the resulting IndexVector in idxVec.
-//   Returns true/false in allConstantIndices if all indices are/aren't const.
-//---------------------------------------------------------------------------
-
-
 //---------------------------------------------------------------------------
 // Function: FoldGetElemChain
 // 
@@ -178,7 +165,7 @@ GetGEPInstArgs(InstructionNode* gepNode,
   Value* ptrVal = gepI->getPointerOperand();
   InstrTreeNode* ptrChild = gepNode->leftChild(); 
 
-  // Extract the index vector of the GEP instructin.
+  // Extract the index vector of the GEP instruction.
   // If all indices are constant and first index is zero, try to fold
   // in preceding GEPs with all constant indices.
   for (User::op_iterator OI=gepI->idx_begin(),  OE=gepI->idx_end();
@@ -386,16 +373,14 @@ GetTmpForCC(Value* boolVal, const Function *F, const Type* ccType,
 
 static inline MachineOpCode 
 ChooseBccInstruction(const InstructionNode* instrNode,
-                     bool& isFPBranch)
+                     const Type*& setCCType)
 {
   InstructionNode* setCCNode = (InstructionNode*) instrNode->leftChild();
   assert(setCCNode->getOpLabel() == SetCCOp);
   BinaryOperator* setCCInstr =cast<BinaryOperator>(setCCNode->getInstruction());
-  const Type* setCCType = setCCInstr->getOperand(0)->getType();
+  setCCType = setCCInstr->getOperand(0)->getType();
   
-  isFPBranch = setCCType->isFloatingPoint(); // Return value: don't delete!
-  
-  if (isFPBranch)
+  if (setCCType->isFloatingPoint())
     return ChooseBFpccInstruction(instrNode, setCCInstr);
   else
     return ChooseBpccInstruction(instrNode, setCCInstr);
@@ -431,13 +416,8 @@ ChooseMovFpcciInstruction(const InstructionNode* instrNode)
 }
 
 
-// Assumes that SUBcc v1, v2 -> v3 has been executed.
-// In most cases, we want to clear v3 and then follow it by instruction
-// MOVcc 1 -> v3.
-// Set mustClearReg=false if v3 need not be cleared before conditional move.
-// Set valueToMove=0 if we want to conditionally move 0 instead of 1
-//                      (i.e., we want to test inverse of a condition)
-// (The latter two cases do not seem to arise because SetNE needs nothing.)
+// ChooseMovpcciForSetCC -- Choose a conditional-move instruction
+// based on the type of SetCC operation.
 // 
 // WARNING: since this function has only one caller, it always returns
 // the opcode that expects an immediate and a register. If this function
@@ -446,84 +426,119 @@ ChooseMovFpcciInstruction(const InstructionNode* instrNode)
 //
 // It will be necessary to expand convertOpcodeFromRegToImm() to handle the
 // new cases of opcodes.
+// 
+static MachineOpCode
+ChooseMovpcciForSetCC(const InstructionNode* instrNode)
+{
+  MachineOpCode opCode = V9::INVALID_OPCODE;
+
+  const Type* opType = instrNode->leftChild()->getValue()->getType();
+  assert(opType->isIntegral() || isa<PointerType>(opType));
+  bool noSign = opType->isUnsigned() || isa<PointerType>(opType);
+  
+  switch(instrNode->getInstruction()->getOpcode())
+  {
+  case Instruction::SetEQ: opCode = V9::MOVEi;                        break;
+  case Instruction::SetLE: opCode = noSign? V9::MOVLEUi : V9::MOVLEi; break;
+  case Instruction::SetGE: opCode = noSign? V9::MOVCCi  : V9::MOVGEi; break;
+  case Instruction::SetLT: opCode = noSign? V9::MOVCSi  : V9::MOVLi;  break;
+  case Instruction::SetGT: opCode = noSign? V9::MOVGUi  : V9::MOVGi;  break;
+  case Instruction::SetNE: opCode = V9::MOVNEi;                       break;
+  default: assert(0 && "Unrecognized LLVM instr!"); break; 
+  }
+  
+  return opCode;
+}
+
+
+// ChooseMovpregiForSetCC -- Choose a conditional-move-on-register-value
+// instruction based on the type of SetCC operation.  These instructions
+// compare a register with 0 and perform the move if the comparison is true.
+// 
+// WARNING: like the previous function, this function always returns
+// the opcode that expects an immediate and a register.  See above.
+// 
 static MachineOpCode
-ChooseMovpcciAfterSub(const InstructionNode* instrNode)
+ChooseMovpregiForSetCC(const InstructionNode* instrNode)
 {
   MachineOpCode opCode = V9::INVALID_OPCODE;
   
   switch(instrNode->getInstruction()->getOpcode())
   {
-  case Instruction::SetEQ: opCode = V9::MOVEi;  break;
-  case Instruction::SetLE: opCode = V9::MOVLEi; break;
-  case Instruction::SetGE: opCode = V9::MOVGEi; break;
-  case Instruction::SetLT: opCode = V9::MOVLi;  break;
-  case Instruction::SetGT: opCode = V9::MOVGi;  break;
-  case Instruction::SetNE: opCode = V9::MOVNEi; break;
+  case Instruction::SetEQ: opCode = V9::MOVRZi;  break;
+  case Instruction::SetLE: opCode = V9::MOVRLEZi; break;
+  case Instruction::SetGE: opCode = V9::MOVRGEZi; break;
+  case Instruction::SetLT: opCode = V9::MOVRLZi;  break;
+  case Instruction::SetGT: opCode = V9::MOVRGZi;  break;
+  case Instruction::SetNE: opCode = V9::MOVRNZi; break;
   default: assert(0 && "Unrecognized VM instr!"); break; 
   }
   
   return opCode;
 }
 
+
 static inline MachineOpCode
-ChooseConvertToFloatInstr(OpLabel vopCode, const Type* opType)
+ChooseConvertToFloatInstr(const TargetMachine& target,
+                          OpLabel vopCode, const Type* opType)
 {
   assert((vopCode == ToFloatTy || vopCode == ToDoubleTy) &&
          "Unrecognized convert-to-float opcode!");
+  assert((opType->isIntegral() || opType->isFloatingPoint() ||
+          isa<PointerType>(opType))
+         && "Trying to convert a non-scalar type to FLOAT/DOUBLE?");
 
   MachineOpCode opCode = V9::INVALID_OPCODE;
-  
-  if (opType == Type::SByteTy || opType == Type::UByteTy ||
-      opType == Type::ShortTy || opType == Type::UShortTy ||
-      opType == Type::IntTy   || opType == Type::UIntTy)
-      opCode = (vopCode == ToFloatTy? V9::FITOS : V9::FITOD);
-  else if (opType == Type::LongTy || opType == Type::ULongTy ||
-           isa<PointerType>(opType))
-      opCode = (vopCode == ToFloatTy? V9::FXTOS : V9::FXTOD);
-  else if (opType == Type::FloatTy)
-      opCode = (vopCode == ToFloatTy? V9::INVALID_OPCODE : V9::FSTOD);
+
+  unsigned opSize = target.getTargetData().getTypeSize(opType);
+
+  if (opType == Type::FloatTy)
+    opCode = (vopCode == ToFloatTy? V9::NOP : V9::FSTOD);
   else if (opType == Type::DoubleTy)
-      opCode = (vopCode == ToFloatTy? V9::FDTOS : V9::INVALID_OPCODE);
-  else
-    assert(0 && "Trying to convert a non-scalar type to DOUBLE?");
+    opCode = (vopCode == ToFloatTy? V9::FDTOS : V9::NOP);
+  else if (opSize <= 4)
+    opCode = (vopCode == ToFloatTy? V9::FITOS : V9::FITOD);
+  else {
+    assert(opSize == 8 && "Unrecognized type size > 4 and < 8!");
+    opCode = (vopCode == ToFloatTy? V9::FXTOS : V9::FXTOD);
+  }
   
   return opCode;
 }
 
 static inline MachineOpCode 
-ChooseConvertFPToIntInstr(Type::PrimitiveID tid, const Type* opType)
+ChooseConvertFPToIntInstr(const TargetMachine& target,
+                          const Type* destType, const Type* opType)
 {
-  MachineOpCode opCode = V9::INVALID_OPCODE;;
-
   assert((opType == Type::FloatTy || opType == Type::DoubleTy)
          && "This function should only be called for FLOAT or DOUBLE");
+  assert((destType->isIntegral() || isa<PointerType>(destType))
+         && "Trying to convert FLOAT/DOUBLE to a non-scalar type?");
 
-  // SPARC does not have a float-to-uint conversion, only a float-to-int.
-  // For converting an FP value to uint32_t, we first need to convert to
-  // uint64_t and then to uint32_t, or we may overflow the signed int
-  // representation even for legal uint32_t values.  This expansion is
-  // done by the Preselection pass.
-  // 
-  if (tid == Type::UIntTyID) {
-    assert(tid != Type::UIntTyID && "FP-to-uint conversions must be expanded"
-           " into FP->long->uint for SPARC v9:  SO RUN PRESELECTION PASS!");
-  } else if (tid == Type::SByteTyID || tid == Type::ShortTyID || 
-             tid == Type::IntTyID   || tid == Type::UByteTyID ||
-             tid == Type::UShortTyID) {
+  MachineOpCode opCode = V9::INVALID_OPCODE;
+
+  unsigned destSize = target.getTargetData().getTypeSize(destType);
+
+  if (destType == Type::UIntTy)
+    assert(destType != Type::UIntTy && "Expand FP-to-uint beforehand.");
+  else if (destSize <= 4)
     opCode = (opType == Type::FloatTy)? V9::FSTOI : V9::FDTOI;
-  } else if (tid == Type::LongTyID || tid == Type::ULongTyID) {
-      opCode = (opType == Type::FloatTy)? V9::FSTOX : V9::FDTOX;
-  } else
-    assert(0 && "Should not get here, Mo!");
+  else {
+    assert(destSize == 8 && "Unrecognized type size > 4 and < 8!");
+    opCode = (opType == Type::FloatTy)? V9::FSTOX : V9::FDTOX;
+  }
 
   return opCode;
 }
 
-MachineInstr*
-CreateConvertFPToIntInstr(Type::PrimitiveID destTID,
-                          Value* srcVal, Value* destVal)
+static MachineInstr*
+CreateConvertFPToIntInstr(const TargetMachine& target,
+                          Value* srcVal,
+                          Value* destVal,
+                          const Type* destType)
 {
-  MachineOpCode opCode = ChooseConvertFPToIntInstr(destTID, srcVal->getType());
+  MachineOpCode opCode = ChooseConvertFPToIntInstr(target, destType,
+                                                   srcVal->getType());
   assert(opCode != V9::INVALID_OPCODE && "Expected to need conversion!");
   return BuildMI(opCode, 2).addReg(srcVal).addRegDef(destVal);
 }
@@ -531,19 +546,11 @@ CreateConvertFPToIntInstr(Type::PrimitiveID destTID,
 // CreateCodeToConvertFloatToInt: Convert FP value to signed or unsigned integer
 // The FP value must be converted to the dest type in an FP register,
 // and the result is then copied from FP to int register via memory.
-//
+// SPARC does not have a float-to-uint conversion, only a float-to-int (fdtoi).
 // Since fdtoi converts to signed integers, any FP value V between MAXINT+1
-// and MAXUNSIGNED (i.e., 2^31 <= V <= 2^32-1) would be converted incorrectly
-// *only* when converting to an unsigned.  (Unsigned byte, short or long
-// don't have this problem.)
-// For unsigned int, we therefore have to generate the code sequence:
-// 
-//      if (V > (float) MAXINT) {
-//        unsigned result = (unsigned) (V  - (float) MAXINT);
-//        result = result + (unsigned) MAXINT;
-//      }
-//      else
-//        result = (unsigned) V;
+// and MAXUNSIGNED (i.e., 2^31 <= V <= 2^32-1) would be converted incorrectly.
+// Therefore, for converting an FP value to uint32_t, we first need to convert
+// to uint64_t and then to uint32_t.
 // 
 static void
 CreateCodeToConvertFloatToInt(const TargetMachine& target,
@@ -552,24 +559,46 @@ CreateCodeToConvertFloatToInt(const TargetMachine& target,
                               std::vector<MachineInstr*>& mvec,
                               MachineCodeForInstruction& mcfi)
 {
+  Function* F = destI->getParent()->getParent();
+
   // Create a temporary to represent the FP register into which the
   // int value will be placed after conversion.  The type of this temporary
   // depends on the type of FP register to use: single-prec for a 32-bit
   // int or smaller; double-prec for a 64-bit int.
   // 
   size_t destSize = target.getTargetData().getTypeSize(destI->getType());
-  const Type* destTypeToUse = (destSize > 4)? Type::DoubleTy : Type::FloatTy;
-  TmpInstruction* destForCast = new TmpInstruction(mcfi, destTypeToUse, opVal);
 
-  // Create the fp-to-int conversion code
-  MachineInstr* M =CreateConvertFPToIntInstr(destI->getType()->getPrimitiveID(),
-                                             opVal, destForCast);
-  mvec.push_back(M);
+  const Type* castDestType = destI->getType(); // type for the cast instr result
+  const Type* castDestRegType;          // type for cast instruction result reg
+  TmpInstruction* destForCast;          // dest for cast instruction
+  Instruction* fpToIntCopyDest = destI; // dest for fp-reg-to-int-reg copy instr
+
+  // For converting an FP value to uint32_t, we first need to convert to
+  // uint64_t and then to uint32_t, as explained above.
+  if (destI->getType() == Type::UIntTy) {
+    castDestType    = Type::ULongTy;       // use this instead of type of destI
+    castDestRegType = Type::DoubleTy;      // uint64_t needs 64-bit FP register.
+    destForCast     = new TmpInstruction(mcfi, castDestRegType, opVal);
+    fpToIntCopyDest = new TmpInstruction(mcfi, castDestType, destForCast);
+  }
+  else {
+    castDestRegType = (destSize > 4)? Type::DoubleTy : Type::FloatTy;
+    destForCast = new TmpInstruction(mcfi, castDestRegType, opVal);
+  }
+
+  // Create the fp-to-int conversion instruction (src and dest regs are FP regs)
+  mvec.push_back(CreateConvertFPToIntInstr(target, opVal, destForCast,
+                                           castDestType));
 
   // Create the fpreg-to-intreg copy code
-  target.getInstrInfo().
-    CreateCodeToCopyFloatToInt(target, destI->getParent()->getParent(),
-                               destForCast, destI, mvec, mcfi);
+  target.getInstrInfo().CreateCodeToCopyFloatToInt(target, F, destForCast,
+                                                   fpToIntCopyDest, mvec, mcfi);
+
+  // Create the uint64_t to uint32_t conversion, if needed
+  if (destI->getType() == Type::UIntTy)
+    target.getInstrInfo().
+      CreateZeroExtensionInstructions(target, F, fpToIntCopyDest, destI,
+                                      /*numLowBits*/ 32, mvec, mcfi);
 }
 
 
@@ -796,7 +825,8 @@ CreateMulConstInstruction(const TargetMachine &target, Function* F,
   
   if (resultType->isInteger() || isa<PointerType>(resultType)) {
     bool isValidConst;
-    int64_t C = GetConstantValueAsSignedInt(constOp, isValidConst);
+    int64_t C = (int64_t) target.getInstrInfo().ConvertConstantToIntType(target,
+                                     constOp, constOp->getType(), isValidConst);
     if (isValidConst) {
       unsigned pow;
       bool needNeg = false;
@@ -949,53 +979,72 @@ CreateDivConstInstruction(TargetMachine &target,
   if (resultType->isInteger()) {
     unsigned pow;
     bool isValidConst;
-    int64_t C = GetConstantValueAsSignedInt(constOp, isValidConst);
+    int64_t C = (int64_t) target.getInstrInfo().ConvertConstantToIntType(target,
+                                     constOp, constOp->getType(), isValidConst);
     if (isValidConst) {
       bool needNeg = false;
       if (C < 0) {
         needNeg = true;
         C = -C;
       }
-          
+      
       if (C == 1) {
         mvec.push_back(BuildMI(V9::ADDr, 3).addReg(LHS).addMReg(ZeroReg)
                        .addRegDef(destVal));
       } else if (isPowerOf2(C, pow)) {
         unsigned opCode;
         Value* shiftOperand;
+        unsigned opSize = target.getTargetData().getTypeSize(resultType);
 
         if (resultType->isSigned()) {
-          // The result may be negative and we need to add one before shifting 
-          // a negative value.  Use:
-          //      srl i0, 31, x0; add x0, i0, i1         (if i0 is <= 32 bits)
-          // or
-          //      srlx i0, 63, x0; add x0, i0, i1         (if i0 is 64 bits)
-          // to compute i1=i0+1 if i0 < 0 and i1=i0 otherwise.  
+          // For N / 2^k, if the operand N is negative,
+          // we need to add (2^k - 1) before right-shifting by k, i.e.,
           // 
-          TmpInstruction *srlTmp, *addTmp;
+          //    (N / 2^k) = N >> k,               if N >= 0;
+          //                (N + 2^k - 1) >> k,   if N < 0
+          // 
+          // If N is <= 32 bits, use:
+          //    sra N, 31, t1           // t1 = ~0,         if N < 0,  0 else
+          //    srl t1, 32-k, t2        // t2 = 2^k - 1,    if N < 0,  0 else
+          //    add t2, N, t3           // t3 = N + 2^k -1, if N < 0,  N else
+         //    sra t3, k, result       // result = N / 2^k
+          // 
+          // If N is 64 bits, use:
+          //    srax N,  k-1,  t1       // t1 = sign bit in high k positions
+          //    srlx t1, 64-k, t2       // t2 = 2^k - 1,    if N < 0,  0 else
+          //    add t2, N, t3           // t3 = N + 2^k -1, if N < 0,  N else
+          //    srax t3, k, result      // result = N / 2^k
+          //
+          TmpInstruction *sraTmp, *srlTmp, *addTmp;
           MachineCodeForInstruction& mcfi
             = MachineCodeForInstruction::get(destVal);
-          srlTmp = new TmpInstruction(mcfi, resultType, LHS, 0, "getSign");
+          sraTmp = new TmpInstruction(mcfi, resultType, LHS, 0, "getSign");
+          srlTmp = new TmpInstruction(mcfi, resultType, LHS, 0, "getPlus2km1");
           addTmp = new TmpInstruction(mcfi, resultType, LHS, srlTmp,"incIfNeg");
 
-          // Create the SRL or SRLX instruction to get the sign bit
-          mvec.push_back(BuildMI((resultType==Type::LongTy) ?
-                                 V9::SRLXi6 : V9::SRLi5, 3)
+          // Create the SRA or SRAX instruction to get the sign bit
+          mvec.push_back(BuildMI((opSize > 4)? V9::SRAXi6 : V9::SRAi5, 3)
                          .addReg(LHS)
-                         .addSImm((resultType==Type::LongTy)? 63 : 31)
+                         .addSImm((resultType==Type::LongTy)? pow-1 : 31)
+                         .addRegDef(sraTmp));
+
+          // Create the SRL or SRLX instruction to get 2^pow-1 (0 if LHS >= 0)
+          mvec.push_back(BuildMI((opSize > 4)? V9::SRLXi6 : V9::SRLi5, 3)
+                         .addReg(sraTmp)
+                         .addSImm((resultType==Type::LongTy)? 64-pow : 32-pow)
                          .addRegDef(srlTmp));
 
-          // Create the ADD instruction to add 1 for negative values
+          // Create the ADD instruction to add 2^pow-1 for negative values
           mvec.push_back(BuildMI(V9::ADDr, 3).addReg(LHS).addReg(srlTmp)
                          .addRegDef(addTmp));
 
           // Get the shift operand and "right-shift" opcode to do the divide
           shiftOperand = addTmp;
-          opCode = (resultType==Type::LongTy) ? V9::SRAXi6 : V9::SRAi5;
+          opCode = (opSize > 4)? V9::SRAXi6 : V9::SRAi5;
         } else {
           // Get the shift operand and "right-shift" opcode to do the divide
           shiftOperand = LHS;
-          opCode = (resultType==Type::LongTy) ? V9::SRLXi6 : V9::SRLi5;
+          opCode = (opSize > 4)? V9::SRLXi6 : V9::SRLi5;
         }
 
         // Now do the actual shift!
@@ -1039,7 +1088,9 @@ CreateCodeForVariableSizeAlloca(const TargetMachine& target,
   // compile time if the total size is a known constant.
   if (isa<Constant>(numElementsVal)) {
     bool isValid;
-    int64_t numElem = GetConstantValueAsSignedInt(numElementsVal, isValid);
+    int64_t numElem = (int64_t) target.getInstrInfo().
+      ConvertConstantToIntType(target, numElementsVal,
+                               numElementsVal->getType(), isValid);
     assert(isValid && "Unexpectedly large array dimension in alloca!");
     int64_t total = numElem * tsize;
     if (int extra= total % target.getFrameInfo().getStackFrameSizeAlignment())
@@ -1317,11 +1368,8 @@ ForwardOperand(InstructionNode* treeNode,
       }
           
       for (unsigned i=0,numOps=minstr->getNumImplicitRefs(); i<numOps; ++i)
-        if (minstr->getImplicitRef(i) == unusedOp) {
-          minstr->setImplicitRef(i, fwdOp,
-                                 minstr->getImplicitOp(i).opIsDefOnly(),
-                                 minstr->getImplicitOp(i).opIsDefAndUse());
-        }
+        if (minstr->getImplicitRef(i) == unusedOp)
+          minstr->setImplicitRef(i, fwdOp);
     }
   }
 }
@@ -1348,8 +1396,7 @@ bool CodeGenIntrinsic(LLVMIntrinsic::ID iid, CallInst &callInstr,
 {
   switch (iid) {
   case LLVMIntrinsic::va_start: {
-    // Get the address of the first vararg value on stack and copy it to
-    // the argument of va_start(va_list* ap).
+    // Get the address of the first incoming vararg argument on the stack
     bool ignore;
     Function* func = cast<Function>(callInstr.getParent()->getParent());
     int numFixedArgs   = func->getFunctionType()->getNumParams();
@@ -1358,7 +1405,7 @@ bool CodeGenIntrinsic(LLVMIntrinsic::ID iid, CallInst &callInstr,
     int firstVarArgOff = numFixedArgs * argSize + target.getFrameInfo().
       getFirstIncomingArgOffset(MachineFunction::get(func), ignore);
     mvec.push_back(BuildMI(V9::ADDi, 3).addMReg(fpReg).addSImm(firstVarArgOff).
-                   addReg(callInstr.getOperand(1)));
+                   addRegDef(&callInstr));
     return true;
   }
 
@@ -1366,12 +1413,50 @@ bool CodeGenIntrinsic(LLVMIntrinsic::ID iid, CallInst &callInstr,
     return true;                        // no-op on Sparc
 
   case LLVMIntrinsic::va_copy:
-    // Simple copy of current va_list (arg2) to new va_list (arg1)
+    // Simple copy of current va_list (arg1) to new va_list (result)
     mvec.push_back(BuildMI(V9::ORr, 3).
                    addMReg(target.getRegInfo().getZeroRegNum()).
-                   addReg(callInstr.getOperand(2)).
-                   addReg(callInstr.getOperand(1)));
+                   addReg(callInstr.getOperand(1)).
+                   addRegDef(&callInstr));
+    return true;
+
+  case LLVMIntrinsic::sigsetjmp:
+  case LLVMIntrinsic::setjmp: {
+    // act as if we return 0
+    unsigned g0 = target.getRegInfo().getZeroRegNum();
+    mvec.push_back(BuildMI(V9::ORr,3).addMReg(g0).addMReg(g0)
+                   .addReg(&callInstr, MOTy::Def));
     return true;
+  }
+
+  case LLVMIntrinsic::siglongjmp:
+  case LLVMIntrinsic::longjmp: {
+    // call abort()
+    Module* M = callInstr.getParent()->getParent()->getParent();
+    const FunctionType *voidvoidFuncTy =
+      FunctionType::get(Type::VoidTy, std::vector<const Type*>(), false);
+    Function *F = M->getOrInsertFunction("abort", voidvoidFuncTy);
+    assert(F && "Unable to get or create `abort' function declaration");
+
+    // Create hidden virtual register for return address with type void*
+    TmpInstruction* retAddrReg =
+      new TmpInstruction(MachineCodeForInstruction::get(&callInstr),
+                         PointerType::get(Type::VoidTy), &callInstr);
+    
+    // Use a descriptor to pass information about call arguments
+    // to the register allocator.  This descriptor will be "owned"
+    // and freed automatically when the MachineCodeForInstruction
+    // object for the callInstr goes away.
+    CallArgsDescriptor* argDesc =
+      new CallArgsDescriptor(&callInstr, retAddrReg, false, false);
+
+    MachineInstr* callMI = BuildMI(V9::CALL, 1).addPCDisp(F);
+    callMI->addImplicitRef(retAddrReg, /*isDef*/ true);
+    
+    mvec.push_back(callMI);
+    mvec.push_back(BuildMI(V9::NOP, 0));
+    return true;
+  }
 
   default:
     return false;
@@ -1443,6 +1528,7 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
   unsigned allocaSize = 0;
   MachineInstr* M, *M2;
   unsigned L;
+  bool foldCase = false;
 
   mvec.clear(); 
   
@@ -1455,22 +1541,19 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
   // Let's check for chain rules outside the switch so that we don't have
   // to duplicate the list of chain rule production numbers here again
   // 
-  if (ThisIsAChainRule(ruleForNode))
-    {
-      // Chain rules have a single nonterminal on the RHS.
-      // Get the rule that matches the RHS non-terminal and use that instead.
-      // 
-      assert(nts[0] && ! nts[1]
-             && "A chain rule should have only one RHS non-terminal!");
-      nextRule = burm_rule(subtreeRoot->state, nts[0]);
-      nts = burm_nts[nextRule];
-      GetInstructionsByRule(subtreeRoot, nextRule, nts, target, mvec);
-    }
-  else
-    {
-      switch(ruleForNode) {
-      case 1:  // stmt:   Ret
-      case 2:  // stmt:   RetValue(reg)
+  if (ThisIsAChainRule(ruleForNode)) {
+    // Chain rules have a single nonterminal on the RHS.
+    // Get the rule that matches the RHS non-terminal and use that instead.
+    // 
+    assert(nts[0] && ! nts[1]
+           && "A chain rule should have only one RHS non-terminal!");
+    nextRule = burm_rule(subtreeRoot->state, nts[0]);
+    nts = burm_nts[nextRule];
+    GetInstructionsByRule(subtreeRoot, nextRule, nts, target, mvec);
+  } else {
+    switch(ruleForNode) {
+      case 1:   // stmt:   Ret
+      case 2:   // stmt:   RetValue(reg)
       {         // NOTE: Prepass of register allocation is responsible
                 //      for moving return value to appropriate register.
                 // Copy the return value to the required return register.
@@ -1491,9 +1574,11 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
           BuildMI(V9::JMPLRETi, 3).addReg(returnAddrTmp).addSImm(8)
           .addMReg(target.getRegInfo().getZeroRegNum(), MOTy::Def);
       
-        // Insert a copy to copy the return value to the appropriate register
-        // -- For FP values, create a FMOVS or FMOVD instruction
-        // -- For non-FP values, create an add-with-0 instruction
+        // If there is a value to return, we need to:
+        // (a) Sign-extend the value if it is smaller than 8 bytes (reg size)
+        // (b) Insert a copy to copy the return value to the appropriate reg.
+        //     -- For FP values, create a FMOVS or FMOVD instruction
+        //     -- For non-FP values, create an add-with-0 instruction
         // 
         if (retVal != NULL) {
           const UltraSparcRegInfo& regInfo =
@@ -1505,19 +1590,39 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
                                 : (unsigned) SparcIntRegClass::i0);
           retRegNum = regInfo.getUnifiedRegNum(regClassID, retRegNum);
 
-          // Create a virtual register to represent it and mark
-          // this vreg as being an implicit operand of the ret MI
+          // (a) Insert sign-extension instructions for small signed values.
+          // 
+          Value* retValToUse = retVal;
+          if (retType->isIntegral() && retType->isSigned()) {
+            unsigned retSize = target.getTargetData().getTypeSize(retType);
+            if (retSize <= 4) {
+              // create a temporary virtual reg. to hold the sign-extension
+              retValToUse = new TmpInstruction(mcfi, retVal);
+
+              // sign-extend retVal and put the result in the temporary reg.
+              target.getInstrInfo().CreateSignExtensionInstructions
+                (target, returnInstr->getParent()->getParent(),
+                 retVal, retValToUse, 8*retSize, mvec, mcfi);
+            }
+          }
+
+          // (b) Now, insert a copy to the appropriate register:
+          //     -- For FP values, create a FMOVS or FMOVD instruction
+          //     -- For non-FP values, create an add-with-0 instruction
+          // 
+          // First, create a virtual register to represent the register and
+          // mark this vreg as being an implicit operand of the ret MI.
           TmpInstruction* retVReg = 
-            new TmpInstruction(mcfi, retVal, NULL, "argReg");
-            
+            new TmpInstruction(mcfi, retValToUse, NULL, "argReg");
+          
           retMI->addImplicitRef(retVReg);
-            
+          
           if (retType->isFloatingPoint())
             M = (BuildMI(retType==Type::FloatTy? V9::FMOVS : V9::FMOVD, 2)
-                 .addReg(retVal).addReg(retVReg, MOTy::Def));
+                 .addReg(retValToUse).addReg(retVReg, MOTy::Def));
           else
             M = (BuildMI(ChooseAddInstructionByType(retType), 3)
-                 .addReg(retVal).addSImm((int64_t) 0)
+                 .addReg(retValToUse).addSImm((int64_t) 0)
                  .addReg(retVReg, MOTy::Def));
 
           // Mark the operand with the register it should be assigned
@@ -1565,7 +1670,8 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         
         if ((constVal->getType()->isInteger()
              || isa<PointerType>(constVal->getType()))
-            && GetConstantValueAsSignedInt(constVal, isValidConst) == 0
+            && target.getInstrInfo().ConvertConstantToIntType(target,
+                             constVal, constVal->getType(), isValidConst) == 0
             && isValidConst)
           {
             // That constant is a zero after all...
@@ -1604,11 +1710,11 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         // TmpInstruction representing that CC.
         // 
         BranchInst* brInst = cast<BranchInst>(subtreeRoot->getInstruction());
-        bool isFPBranch;
-        unsigned Opcode = ChooseBccInstruction(subtreeRoot, isFPBranch);
+        const Type* setCCType;
+        unsigned Opcode = ChooseBccInstruction(subtreeRoot, setCCType);
         Value* ccValue = GetTmpForCC(subtreeRoot->leftChild()->getValue(),
                                      brInst->getParent()->getParent(),
-                                     isFPBranch? Type::FloatTy : Type::IntTy,
+                                     setCCType,
                                      MachineCodeForInstruction::get(brInst));
         M = BuildMI(Opcode, 2).addCCReg(ccValue)
                               .addPCDisp(brInst->getSuccessor(0));
@@ -1669,8 +1775,26 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         assert(0 && "VRegList should never be the topmost non-chain rule");
         break;
 
-      case 21: // bool:  Not(bool,reg): Both these are implemented as:
-      case 421:        // reg:   BNot(reg,reg):        reg = reg XOR-NOT 0
+      case 21: // bool:  Not(bool,reg): Compute with a conditional-move-on-reg
+      { // First find the unary operand. It may be left or right, usually right.
+        Instruction* notI = subtreeRoot->getInstruction();
+        Value* notArg = BinaryOperator::getNotArgument(
+                           cast<BinaryOperator>(subtreeRoot->getInstruction()));
+        unsigned ZeroReg = target.getRegInfo().getZeroRegNum();
+
+        // Unconditionally set register to 0
+        mvec.push_back(BuildMI(V9::SETHI, 2).addZImm(0).addRegDef(notI));
+
+        // Now conditionally move 1 into the register.
+        // Mark the register as a use (as well as a def) because the old
+        // value will be retained if the condition is false.
+        mvec.push_back(BuildMI(V9::MOVRZi, 3).addReg(notArg).addZImm(1)
+                       .addReg(notI, MOTy::UseAndDef));
+
+        break;
+      }
+
+      case 421:        // reg:   BNot(reg,reg): Compute as reg = reg XOR-NOT 0
       { // First find the unary operand. It may be left or right, usually right.
         Value* notArg = BinaryOperator::getNotArgument(
                            cast<BinaryOperator>(subtreeRoot->getInstruction()));
@@ -1680,11 +1804,28 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         break;
       }
 
+      case 322:        // reg:   Not(tobool, reg):
+        // Fold CAST-TO-BOOL with NOT by inverting the sense of cast-to-bool
+        foldCase = true;
+        // Just fall through!
+
       case 22: // reg:   ToBoolTy(reg):
       {
-        const Type* opType = subtreeRoot->leftChild()->getValue()->getType();
-        assert(opType->isIntegral() || isa<PointerType>(opType));
-        forwardOperandNum = 0;          // forward first operand to user
+        Instruction* castI = subtreeRoot->getInstruction();
+        Value* opVal = subtreeRoot->leftChild()->getValue();
+        assert(opVal->getType()->isIntegral() ||
+               isa<PointerType>(opVal->getType()));
+
+        // Unconditionally set register to 0
+        mvec.push_back(BuildMI(V9::SETHI, 2).addZImm(0).addRegDef(castI));
+
+        // Now conditionally move 1 into the register.
+        // Mark the register as a use (as well as a def) because the old
+        // value will be retained if the condition is false.
+        MachineOpCode opCode = foldCase? V9::MOVRZi : V9::MOVRNZi;
+        mvec.push_back(BuildMI(opCode, 3).addReg(opVal).addZImm(1)
+                       .addReg(castI, MOTy::UseAndDef));
+
         break;
       }
       
@@ -1694,6 +1835,8 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
       case 26: // reg:   ToShortTy(reg)
       case 27: // reg:   ToUIntTy(reg)
       case 28: // reg:   ToIntTy(reg)
+      case 29: // reg:   ToULongTy(reg)
+      case 30: // reg:   ToLongTy(reg)
       {
         //======================================================================
         // Rules for integer conversions:
@@ -1715,64 +1858,87 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         // 
         // Since we assume 2s complement representations, this implies:
         // 
-        // -- if operand is smaller than destination, zero-extend or sign-extend
-        //    according to the signedness of the *operand*: source decides.
-        //    ==> we have to do nothing here!
+        // -- If operand is smaller than destination, zero-extend or sign-extend
+        //    according to the signedness of the *operand*: source decides:
+        //    (1) If operand is signed, sign-extend it.
+        //        If dest is unsigned, zero-ext the result!
+        //    (2) If operand is unsigned, our current invariant is that
+        //        its high bits are correct, so zero-extension is not needed.
         // 
-        // -- if operand is same size as or larger than destination, and the
-        //    destination is *unsigned*, zero-extend the operand: dest. decides
-        // 
-        // -- if operand is same size as or larger than destination, and the
-        //    destination is *signed*, the choice is implementation defined:
-        //    we sign-extend the operand: i.e., again dest. decides.
-        //    Note: this matches both Sun's cc and gcc3.2.
+        // -- If operand is same size as or larger than destination,
+        //    zero-extend or sign-extend according to the signedness of
+        //    the *destination*: destination decides:
+        //    (1) If destination is signed, sign-extend (truncating if needed)
+        //        This choice is implementation defined.  We sign-extend the
+        //        operand, which matches both Sun's cc and gcc3.2.
+        //    (2) If destination is unsigned, zero-extend (truncating if needed)
         //======================================================================
 
         Instruction* destI =  subtreeRoot->getInstruction();
+        Function* currentFunc = destI->getParent()->getParent();
+        MachineCodeForInstruction& mcfi=MachineCodeForInstruction::get(destI);
+
         Value* opVal = subtreeRoot->leftChild()->getValue();
         const Type* opType = opVal->getType();
-        if (opType->isIntegral() || isa<PointerType>(opType)) {
-          unsigned opSize = target.getTargetData().getTypeSize(opType);
-          unsigned destSize =
-            target.getTargetData().getTypeSize(destI->getType());
-          if (opSize >= destSize) {
-            // Operand is same size as or larger than dest:
-            // zero- or sign-extend, according to the signeddness of
-            // the destination (see above).
-            if (destI->getType()->isSigned())
-              target.getInstrInfo().CreateSignExtensionInstructions(target,
-                    destI->getParent()->getParent(), opVal, destI, 8*destSize,
-                    mvec, MachineCodeForInstruction::get(destI));
-            else
-              target.getInstrInfo().CreateZeroExtensionInstructions(target,
-                    destI->getParent()->getParent(), opVal, destI, 8*destSize,
-                    mvec, MachineCodeForInstruction::get(destI));
-          } else
-            forwardOperandNum = 0;          // forward first operand to user
+        const Type* destType = destI->getType();
+        unsigned opSize   = target.getTargetData().getTypeSize(opType);
+        unsigned destSize = target.getTargetData().getTypeSize(destType);
+        
+        bool isIntegral = opType->isIntegral() || isa<PointerType>(opType);
+
+        if (opType == Type::BoolTy ||
+            opType == destType ||
+            isIntegral && opSize == destSize && opSize == 8) {
+          // nothing to do in all these cases
+          forwardOperandNum = 0;          // forward first operand to user
+
         } else if (opType->isFloatingPoint()) {
-          CreateCodeToConvertFloatToInt(target, opVal, destI, mvec,
-                                        MachineCodeForInstruction::get(destI));
-          if (destI->getType()->isUnsigned())
+
+          CreateCodeToConvertFloatToInt(target, opVal, destI, mvec, mcfi);
+          if (destI->getType()->isUnsigned() && destI->getType() !=Type::UIntTy)
             maskUnsignedResult = true; // not handled by fp->int code
-        } else
-          assert(0 && "Unrecognized operand type for convert-to-unsigned");
 
-        break;
-      }
+        } else if (isIntegral) {
+
+          bool opSigned     = opType->isSigned();
+          bool destSigned   = destType->isSigned();
+          unsigned extSourceInBits = 8 * std::min<unsigned>(opSize, destSize);
+
+          assert(! (opSize == destSize && opSigned == destSigned) &&
+                 "How can different int types have same size and signedness?");
+
+          bool signExtend = (opSize <  destSize && opSigned ||
+                             opSize >= destSize && destSigned);
+
+          bool signAndZeroExtend = (opSize < destSize && destSize < 8u &&
+                                    opSigned && !destSigned);
+          assert(!signAndZeroExtend || signExtend);
+
+          bool zeroExtendOnly = opSize >= destSize && !destSigned;
+          assert(!zeroExtendOnly || !signExtend);
+
+          if (signExtend) {
+            Value* signExtDest = (signAndZeroExtend
+                                  ? new TmpInstruction(mcfi, destType, opVal)
+                                  : destI);
+
+            target.getInstrInfo().CreateSignExtensionInstructions
+              (target, currentFunc,opVal,signExtDest,extSourceInBits,mvec,mcfi);
+
+            if (signAndZeroExtend)
+              target.getInstrInfo().CreateZeroExtensionInstructions
+              (target, currentFunc, signExtDest, destI, 8*destSize, mvec, mcfi);
+          }
+          else if (zeroExtendOnly) {
+            target.getInstrInfo().CreateZeroExtensionInstructions
+              (target, currentFunc, opVal, destI, extSourceInBits, mvec, mcfi);
+          }
+          else
+            forwardOperandNum = 0;          // forward first operand to user
 
-      case 29: // reg:   ToULongTy(reg)
-      case 30: // reg:   ToLongTy(reg)
-      {
-        Value* opVal = subtreeRoot->leftChild()->getValue();
-        const Type* opType = opVal->getType();
-        if (opType->isIntegral() || isa<PointerType>(opType))
-          forwardOperandNum = 0;          // forward first operand to user
-        else if (opType->isFloatingPoint()) {
-          Instruction* destI =  subtreeRoot->getInstruction();
-          CreateCodeToConvertFloatToInt(target, opVal, destI, mvec,
-                                        MachineCodeForInstruction::get(destI));
         } else
-          assert(0 && "Unrecognized operand type for convert-to-signed");
+          assert(0 && "Unrecognized operand type for convert-to-integer");
+
         break;
       }
       
@@ -1797,9 +1963,9 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         if (forwardOperandNum != 0) {    // we do need the cast
           Value* leftVal = subtreeRoot->leftChild()->getValue();
           const Type* opType = leftVal->getType();
-          MachineOpCode opCode=ChooseConvertToFloatInstr(
+          MachineOpCode opCode=ChooseConvertToFloatInstr(target,
                                        subtreeRoot->getOpLabel(), opType);
-          if (opCode == V9::INVALID_OPCODE) {  // no conversion needed
+          if (opCode == V9::NOP) {      // no conversion needed
             forwardOperandNum = 0;      // forward first operand to user
           } else {
             // If the source operand is a non-FP type it must be
@@ -1916,126 +2082,242 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         // ELSE FALL THROUGH
       
       case 36: // reg:   Div(reg, reg)
+      {
         maskUnsignedResult = true;
-        Add3OperandInstr(ChooseDivInstruction(target, subtreeRoot),
-                         subtreeRoot, mvec);
+
+        // If either operand of divide is smaller than 64 bits, we have
+        // to make sure the unused top bits are correct because they affect
+        // the result.  These bits are already correct for unsigned values.
+        // They may be incorrect for signed values, so sign extend to fill in.
+        Instruction* divI = subtreeRoot->getInstruction();
+        Value* divOp1 = subtreeRoot->leftChild()->getValue();
+        Value* divOp2 = subtreeRoot->rightChild()->getValue();
+        Value* divOp1ToUse = divOp1;
+        Value* divOp2ToUse = divOp2;
+        if (divI->getType()->isSigned()) {
+          unsigned opSize=target.getTargetData().getTypeSize(divI->getType());
+          if (opSize < 8) {
+            MachineCodeForInstruction& mcfi=MachineCodeForInstruction::get(divI);
+            divOp1ToUse = new TmpInstruction(mcfi, divOp1);
+            divOp2ToUse = new TmpInstruction(mcfi, divOp2);
+            target.getInstrInfo().
+              CreateSignExtensionInstructions(target,
+                                              divI->getParent()->getParent(),
+                                              divOp1, divOp1ToUse,
+                                              8*opSize, mvec, mcfi);
+            target.getInstrInfo().
+              CreateSignExtensionInstructions(target,
+                                              divI->getParent()->getParent(),
+                                              divOp2, divOp2ToUse,
+                                              8*opSize, mvec, mcfi);
+          }
+        }
+
+        mvec.push_back(BuildMI(ChooseDivInstruction(target, subtreeRoot), 3)
+                       .addReg(divOp1ToUse)
+                       .addReg(divOp2ToUse)
+                       .addRegDef(divI));
+
         break;
+      }
 
       case  37:        // reg:   Rem(reg, reg)
       case 237:        // reg:   Rem(reg, Constant)
       {
         maskUnsignedResult = true;
-        Instruction* remInstr = subtreeRoot->getInstruction();
-
-        MachineCodeForInstruction& mcfi=MachineCodeForInstruction::get(remInstr);
-        TmpInstruction* quot = new TmpInstruction(mcfi,
-                                        subtreeRoot->leftChild()->getValue(),
-                                        subtreeRoot->rightChild()->getValue());
-        TmpInstruction* prod = new TmpInstruction(mcfi,
-                                        quot,
-                                        subtreeRoot->rightChild()->getValue());
+
+        Instruction* remI   = subtreeRoot->getInstruction();
+        Value* divOp1 = subtreeRoot->leftChild()->getValue();
+        Value* divOp2 = subtreeRoot->rightChild()->getValue();
+
+        MachineCodeForInstruction& mcfi = MachineCodeForInstruction::get(remI);
         
-        M = BuildMI(ChooseDivInstruction(target, subtreeRoot), 3)
-                             .addReg(subtreeRoot->leftChild()->getValue())
-                             .addReg(subtreeRoot->rightChild()->getValue())
-                             .addRegDef(quot);
-        mvec.push_back(M);
+        // If second operand of divide is smaller than 64 bits, we have
+        // to make sure the unused top bits are correct because they affect
+        // the result.  These bits are already correct for unsigned values.
+        // They may be incorrect for signed values, so sign extend to fill in.
+        // 
+        Value* divOpToUse = divOp2;
+        if (divOp2->getType()->isSigned()) {
+          unsigned opSize=target.getTargetData().getTypeSize(divOp2->getType());
+          if (opSize < 8) {
+            divOpToUse = new TmpInstruction(mcfi, divOp2);
+            target.getInstrInfo().
+              CreateSignExtensionInstructions(target,
+                                              remI->getParent()->getParent(),
+                                              divOp2, divOpToUse,
+                                              8*opSize, mvec, mcfi);
+          }
+        }
+
+        // Now compute: result = rem V1, V2 as:
+        //      result = V1 - (V1 / signExtend(V2)) * signExtend(V2)
+        // 
+        TmpInstruction* quot = new TmpInstruction(mcfi, divOp1, divOpToUse);
+        TmpInstruction* prod = new TmpInstruction(mcfi, quot, divOpToUse);
+
+        mvec.push_back(BuildMI(ChooseDivInstruction(target, subtreeRoot), 3)
+                       .addReg(divOp1).addReg(divOpToUse).addRegDef(quot));
         
-        unsigned MulOpcode =
-          ChooseMulInstructionByType(subtreeRoot->getInstruction()->getType());
-        Value *MulRHS = subtreeRoot->rightChild()->getValue();
-        M = BuildMI(MulOpcode, 3).addReg(quot).addReg(MulRHS).addReg(prod,
-                                                                     MOTy::Def);
-        mvec.push_back(M);
+        mvec.push_back(BuildMI(ChooseMulInstructionByType(remI->getType()), 3)
+                       .addReg(quot).addReg(divOpToUse).addRegDef(prod));
+        
+        mvec.push_back(BuildMI(ChooseSubInstructionByType(remI->getType()), 3)
+                       .addReg(divOp1).addReg(prod).addRegDef(remI));
         
-        unsigned Opcode = ChooseSubInstructionByType(
-                                   subtreeRoot->getInstruction()->getType());
-        M = BuildMI(Opcode, 3).addReg(subtreeRoot->leftChild()->getValue())
-                              .addReg(prod).addRegDef(subtreeRoot->getValue());
-        mvec.push_back(M);
         break;
       }
       
       case  38:        // bool:   And(bool, bool)
+      case 138:        // bool:   And(bool, not)
       case 238:        // bool:   And(bool, boolconst)
       case 338:        // reg :   BAnd(reg, reg)
       case 538:        // reg :   BAnd(reg, Constant)
         Add3OperandInstr(V9::ANDr, subtreeRoot, mvec);
         break;
 
-      case 138:        // bool:   And(bool, not)
       case 438:        // bool:   BAnd(bool, bnot)
       { // Use the argument of NOT as the second argument!
         // Mark the NOT node so that no code is generated for it.
+        // If the type is boolean, set 1 or 0 in the result register.
         InstructionNode* notNode = (InstructionNode*) subtreeRoot->rightChild();
         Value* notArg = BinaryOperator::getNotArgument(
                            cast<BinaryOperator>(notNode->getInstruction()));
         notNode->markFoldedIntoParent();
-        Value *LHS = subtreeRoot->leftChild()->getValue();
-        Value *Dest = subtreeRoot->getValue();
-        mvec.push_back(BuildMI(V9::ANDNr, 3).addReg(LHS).addReg(notArg)
-                                       .addReg(Dest, MOTy::Def));
+        Value *lhs = subtreeRoot->leftChild()->getValue();
+        Value *dest = subtreeRoot->getValue();
+        mvec.push_back(BuildMI(V9::ANDNr, 3).addReg(lhs).addReg(notArg)
+                                       .addReg(dest, MOTy::Def));
+
+        if (notArg->getType() == Type::BoolTy) {
+          // set 1 in result register if result of above is non-zero
+          mvec.push_back(BuildMI(V9::MOVRNZi, 3).addReg(dest).addZImm(1)
+                         .addReg(dest, MOTy::UseAndDef));
+        }
+
         break;
       }
 
       case  39:        // bool:   Or(bool, bool)
+      case 139:        // bool:   Or(bool, not)
       case 239:        // bool:   Or(bool, boolconst)
       case 339:        // reg :   BOr(reg, reg)
       case 539:        // reg :   BOr(reg, Constant)
         Add3OperandInstr(V9::ORr, subtreeRoot, mvec);
         break;
 
-      case 139:        // bool:   Or(bool, not)
       case 439:        // bool:   BOr(bool, bnot)
       { // Use the argument of NOT as the second argument!
         // Mark the NOT node so that no code is generated for it.
+        // If the type is boolean, set 1 or 0 in the result register.
         InstructionNode* notNode = (InstructionNode*) subtreeRoot->rightChild();
         Value* notArg = BinaryOperator::getNotArgument(
                            cast<BinaryOperator>(notNode->getInstruction()));
         notNode->markFoldedIntoParent();
-        Value *LHS = subtreeRoot->leftChild()->getValue();
-        Value *Dest = subtreeRoot->getValue();
-        mvec.push_back(BuildMI(V9::ORNr, 3).addReg(LHS).addReg(notArg)
-                       .addReg(Dest, MOTy::Def));
+        Value *lhs = subtreeRoot->leftChild()->getValue();
+        Value *dest = subtreeRoot->getValue();
+
+        mvec.push_back(BuildMI(V9::ORNr, 3).addReg(lhs).addReg(notArg)
+                       .addReg(dest, MOTy::Def));
+
+        if (notArg->getType() == Type::BoolTy) {
+          // set 1 in result register if result of above is non-zero
+          mvec.push_back(BuildMI(V9::MOVRNZi, 3).addReg(dest).addZImm(1)
+                         .addReg(dest, MOTy::UseAndDef));
+        }
+
         break;
       }
 
       case  40:        // bool:   Xor(bool, bool)
+      case 140:        // bool:   Xor(bool, not)
       case 240:        // bool:   Xor(bool, boolconst)
       case 340:        // reg :   BXor(reg, reg)
       case 540:        // reg :   BXor(reg, Constant)
         Add3OperandInstr(V9::XORr, subtreeRoot, mvec);
         break;
 
-      case 140:        // bool:   Xor(bool, not)
       case 440:        // bool:   BXor(bool, bnot)
       { // Use the argument of NOT as the second argument!
         // Mark the NOT node so that no code is generated for it.
+        // If the type is boolean, set 1 or 0 in the result register.
         InstructionNode* notNode = (InstructionNode*) subtreeRoot->rightChild();
         Value* notArg = BinaryOperator::getNotArgument(
                            cast<BinaryOperator>(notNode->getInstruction()));
         notNode->markFoldedIntoParent();
-        Value *LHS = subtreeRoot->leftChild()->getValue();
-        Value *Dest = subtreeRoot->getValue();
-        mvec.push_back(BuildMI(V9::XNORr, 3).addReg(LHS).addReg(notArg)
-                       .addReg(Dest, MOTy::Def));
+        Value *lhs = subtreeRoot->leftChild()->getValue();
+        Value *dest = subtreeRoot->getValue();
+        mvec.push_back(BuildMI(V9::XNORr, 3).addReg(lhs).addReg(notArg)
+                       .addReg(dest, MOTy::Def));
+
+        if (notArg->getType() == Type::BoolTy) {
+          // set 1 in result register if result of above is non-zero
+          mvec.push_back(BuildMI(V9::MOVRNZi, 3).addReg(dest).addZImm(1)
+                         .addReg(dest, MOTy::UseAndDef));
+        }
         break;
       }
 
-      case 41: // boolconst:   SetCC(reg, Constant)
+      case 41: // setCCconst:   SetCC(reg, Constant)
+      { // Comparison is with a constant:
+        // 
+        // If the bool result must be computed into a register (see below),
+        // and the constant is int ZERO, we can use the MOVR[op] instructions
+        // and avoid the SUBcc instruction entirely.
+        // Otherwise this is just the same as case 42, so just fall through.
         // 
-        // If the SetCC was folded into the user (parent), it will be
-        // caught above.  All other cases are the same as case 42,
-        // so just fall through.
+        // The result of the SetCC must be computed and stored in a register if
+        // it is used outside the current basic block (so it must be computed
+        // as a boolreg) or it is used by anything other than a branch.
+        // We will use a conditional move to do this.
         // 
+        Instruction* setCCInstr = subtreeRoot->getInstruction();
+        bool computeBoolVal = (subtreeRoot->parent() == NULL ||
+                               ! AllUsesAreBranches(setCCInstr));
+
+        if (computeBoolVal) {
+          InstrTreeNode* constNode = subtreeRoot->rightChild();
+          assert(constNode &&
+                 constNode->getNodeType() ==InstrTreeNode::NTConstNode);
+          Constant *constVal = cast<Constant>(constNode->getValue());
+          bool isValidConst;
+          
+          if ((constVal->getType()->isInteger()
+               || isa<PointerType>(constVal->getType()))
+              && target.getInstrInfo().ConvertConstantToIntType(target,
+                             constVal, constVal->getType(), isValidConst) == 0
+              && isValidConst)
+          {
+            // That constant is an integer zero after all...
+            // Use a MOVR[op] to compute the boolean result
+            // Unconditionally set register to 0
+            mvec.push_back(BuildMI(V9::SETHI, 2).addZImm(0)
+                           .addRegDef(setCCInstr));
+                
+            // Now conditionally move 1 into the register.
+            // Mark the register as a use (as well as a def) because the old
+            // value will be retained if the condition is false.
+            MachineOpCode movOpCode = ChooseMovpregiForSetCC(subtreeRoot);
+            mvec.push_back(BuildMI(movOpCode, 3)
+                           .addReg(subtreeRoot->leftChild()->getValue())
+                           .addZImm(1).addReg(setCCInstr, MOTy::UseAndDef));
+                
+            break;
+          }
+        }
+        // ELSE FALL THROUGH
+      }
+
       case 42: // bool:   SetCC(reg, reg):
       {
         // This generates a SUBCC instruction, putting the difference in a
         // result reg. if needed, and/or setting a condition code if needed.
         // 
         Instruction* setCCInstr = subtreeRoot->getInstruction();
-        Value* leftVal = subtreeRoot->leftChild()->getValue();
-        bool isFPCompare = leftVal->getType()->isFloatingPoint();
+        Value* leftVal  = subtreeRoot->leftChild()->getValue();
+        Value* rightVal = subtreeRoot->rightChild()->getValue();
+        const Type* opType = leftVal->getType();
+        bool isFPCompare = opType->isFloatingPoint();
         
         // If the boolean result of the SetCC is used outside the current basic
         // block (so it must be computed as a boolreg) or is used by anything
@@ -2058,28 +2340,54 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         // 
         TmpInstruction* tmpForCC = GetTmpForCC(setCCInstr,
                                     setCCInstr->getParent()->getParent(),
-                                    isFPCompare ? Type::FloatTy : Type::IntTy,
+                                    leftVal->getType(),
                                     MachineCodeForInstruction::get(setCCInstr));
+
+        // If the operands are signed values smaller than 4 bytes, then they
+        // must be sign-extended in order to do a valid 32-bit comparison
+        // and get the right result in the 32-bit CC register (%icc).
+        // 
+        Value* leftOpToUse  = leftVal;
+        Value* rightOpToUse = rightVal;
+        if (opType->isIntegral() && opType->isSigned()) {
+          unsigned opSize = target.getTargetData().getTypeSize(opType);
+          if (opSize < 4) {
+            MachineCodeForInstruction& mcfi =
+              MachineCodeForInstruction::get(setCCInstr); 
+
+            // create temporary virtual regs. to hold the sign-extensions
+            leftOpToUse  = new TmpInstruction(mcfi, leftVal);
+            rightOpToUse = new TmpInstruction(mcfi, rightVal);
+            
+            // sign-extend each operand and put the result in the temporary reg.
+            target.getInstrInfo().CreateSignExtensionInstructions
+              (target, setCCInstr->getParent()->getParent(),
+               leftVal, leftOpToUse, 8*opSize, mvec, mcfi);
+            target.getInstrInfo().CreateSignExtensionInstructions
+              (target, setCCInstr->getParent()->getParent(),
+               rightVal, rightOpToUse, 8*opSize, mvec, mcfi);
+          }
+        }
+
         if (! isFPCompare) {
           // Integer condition: set CC and discard result.
-          M = BuildMI(V9::SUBccr, 4)
-            .addReg(subtreeRoot->leftChild()->getValue())
-            .addReg(subtreeRoot->rightChild()->getValue())
-            .addMReg(target.getRegInfo().getZeroRegNum(), MOTy::Def)
-            .addCCReg(tmpForCC, MOTy::Def);
+          mvec.push_back(BuildMI(V9::SUBccr, 4)
+                         .addReg(leftOpToUse)
+                         .addReg(rightOpToUse)
+                         .addMReg(target.getRegInfo().getZeroRegNum(),MOTy::Def)
+                         .addCCReg(tmpForCC, MOTy::Def));
         } else {
           // FP condition: dest of FCMP should be some FCCn register
-          M = BuildMI(ChooseFcmpInstruction(subtreeRoot), 3)
-            .addCCReg(tmpForCC, MOTy::Def)
-            .addReg(subtreeRoot->leftChild()->getValue())
-            .addReg(subtreeRoot->rightChild()->getValue());
+          mvec.push_back(BuildMI(ChooseFcmpInstruction(subtreeRoot), 3)
+                         .addCCReg(tmpForCC, MOTy::Def)
+                         .addReg(leftOpToUse)
+                         .addReg(rightOpToUse));
         }
-        mvec.push_back(M);
         
         if (computeBoolVal) {
           MachineOpCode movOpCode = (isFPCompare
                                      ? ChooseMovFpcciInstruction(subtreeRoot)
-                                     : ChooseMovpcciAfterSub(subtreeRoot));
+                                     : ChooseMovpcciForSetCC(subtreeRoot));
 
           // Unconditionally set register to 0
           M = BuildMI(V9::SETHI, 2).addZImm(0).addRegDef(setCCInstr);
@@ -2174,8 +2482,8 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         // This can also handle any intrinsics that are just function calls.
         // 
         if (! specialIntrinsic) {
-          MachineFunction& MF =
-            MachineFunction::get(callInstr->getParent()->getParent());
+          Function* currentFunc = callInstr->getParent()->getParent();
+          MachineFunction& MF = MachineFunction::get(currentFunc);
           MachineCodeForInstruction& mcfi =
             MachineCodeForInstruction::get(callInstr); 
           const UltraSparcRegInfo& regInfo =
@@ -2213,19 +2521,45 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
             new CallArgsDescriptor(callInstr, retAddrReg,isVarArgs,noPrototype);
           assert(callInstr->getOperand(0) == callee
                  && "This is assumed in the loop below!");
-          
+
+          // Insert sign-extension instructions for small signed values,
+          // if this is an unknown function (i.e., called via a funcptr)
+          // or an external one (i.e., which may not be compiled by llc).
+          // 
+          if (calledFunc == NULL || calledFunc->isExternal()) {
+            for (unsigned i=1, N=callInstr->getNumOperands(); i < N; ++i) {
+              Value* argVal = callInstr->getOperand(i);
+              const Type* argType = argVal->getType();
+              if (argType->isIntegral() && argType->isSigned()) {
+                unsigned argSize = target.getTargetData().getTypeSize(argType);
+                if (argSize <= 4) {
+                  // create a temporary virtual reg. to hold the sign-extension
+                  TmpInstruction* argExtend = new TmpInstruction(mcfi, argVal);
+
+                  // sign-extend argVal and put the result in the temporary reg.
+                  target.getInstrInfo().CreateSignExtensionInstructions
+                    (target, currentFunc, argVal, argExtend,
+                     8*argSize, mvec, mcfi);
+
+                  // replace argVal with argExtend in CallArgsDescriptor
+                  argDesc->getArgInfo(i-1).replaceArgVal(argExtend);
+                }
+              }
+            }
+          }
+
           // Insert copy instructions to get all the arguments into
           // all the places that they need to be.
           // 
           for (unsigned i=1, N=callInstr->getNumOperands(); i < N; ++i) {
             int argNo = i-1;
-            Value* argVal = callInstr->getOperand(i);
+            CallArgInfo& argInfo = argDesc->getArgInfo(argNo);
+            Value* argVal = argInfo.getArgVal(); // don't use callInstr arg here
             const Type* argType = argVal->getType();
-            unsigned regType = regInfo.getRegType(argType);
+            unsigned regType = regInfo.getRegTypeForDataType(argType);
             unsigned argSize = target.getTargetData().getTypeSize(argType);
             int regNumForArg = TargetRegInfo::getInvalidRegNum();
             unsigned regClassIDOfArgReg;
-            CallArgInfo& argInfo = argDesc->getArgInfo(argNo);
 
             // Check for FP arguments to varargs functions.
             // Any such argument in the first $K$ args must be passed in an
@@ -2234,15 +2568,31 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
             // K = #integer argument registers.
             bool isFPArg = argVal->getType()->isFloatingPoint();
             if (isVarArgs && isFPArg) {
-              // If it is a function with no prototype, pass value
-              // as an FP value as well as a varargs value
-              if (noPrototype)
-                argInfo.setUseFPArgReg();
-                
-              // If this arg. is in the first $K$ regs, add copy-
+
+              if (noPrototype) {
+                // It is a function with no prototype: pass value
+                // as an FP value as well as a varargs value.  The FP value
+                // may go in a register or on the stack.  The copy instruction
+                // to the outgoing reg/stack is created by the normal argument
+                // handling code since this is the "normal" passing mode.
+                // 
+                regNumForArg = regInfo.regNumForFPArg(regType,
+                                                      false, false, argNo,
+                                                      regClassIDOfArgReg);
+                if (regNumForArg == regInfo.getInvalidRegNum())
+                  argInfo.setUseStackSlot();
+                else
+                  argInfo.setUseFPArgReg();
+              }
+              
+              // If this arg. is in the first $K$ regs, add special copy-
               // float-to-int instructions to pass the value as an int.
-              // To check if it is in teh first $K$, get the register
-              // number for the arg #i.
+              // To check if it is in the first $K$, get the register
+              // number for the arg #i.  These copy instructions are
+              // generated here because they are extra cases and not needed
+              // for the normal argument handling (some code reuse is
+              // possible though -- later).
+              // 
               int copyRegNum = regInfo.regNumForIntArg(false, false, argNo,
                                                        regClassIDOfArgReg);
               if (copyRegNum != regInfo.getInvalidRegNum()) {
@@ -2254,7 +2604,7 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
                                                              argVal, NULL,
                                                              "argRegCopy");
                 callMI->addImplicitRef(argVReg);
-                        
+                
                 // Get a temp stack location to use to copy
                 // float-to-int via the stack.
                 // 
@@ -2341,14 +2691,16 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
               M = BuildMI(storeOpCode, 3).addReg(argVal)
                 .addMReg(regInfo.getStackPointer()).addSImm(argOffset);
               mvec.push_back(M);
-            } else {
+            }
+            else if (regNumForArg != regInfo.getInvalidRegNum()) {
+
               // Create a virtual register to represent the arg reg. Mark
               // this vreg as being an implicit operand of the call MI.
               TmpInstruction* argVReg = 
                 new TmpInstruction(mcfi, argVal, NULL, "argReg");
 
               callMI->addImplicitRef(argVReg);
-                    
+              
               // Generate the reg-to-reg copy into the outgoing arg reg.
               // -- For FP values, create a FMOVS or FMOVD instruction
               // -- For non-FP values, create an add-with-0 instruction
@@ -2359,7 +2711,7 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
                 M = (BuildMI(ChooseAddInstructionByType(argType), 3)
                      .addReg(argVal).addSImm((int64_t) 0)
                      .addReg(argVReg, MOTy::Def));
-                    
+              
               // Mark the operand with the register it should be assigned
               M->SetRegForOperand(M->getNumOperands()-1, regNumForArg);
               callMI->SetRegForImplicitRef(callMI->getNumImplicitRefs()-1,
@@ -2367,6 +2719,9 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
 
               mvec.push_back(M);
             }
+            else
+              assert(argInfo.getArgCopy() != regInfo.getInvalidRegNum() &&
+                     "Arg. not in stack slot, primary or secondary register?");
           }
 
           // add call instruction and delay slot before copying return value
@@ -2432,9 +2787,10 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         const Type* opType = argVal1->getType();
         assert((opType->isInteger() || isa<PointerType>(opType)) &&
                "Shl unsupported for other types");
+        unsigned opSize = target.getTargetData().getTypeSize(opType);
         
         CreateShiftInstructions(target, shlInstr->getParent()->getParent(),
-                                (opType == Type::LongTy)? V9::SLLXr6:V9::SLLr5,
+                                (opSize > 4)? V9::SLLXr6:V9::SLLr5,
                                 argVal1, argVal2, 0, shlInstr, mvec,
                                 MachineCodeForInstruction::get(shlInstr));
         break;
@@ -2445,9 +2801,10 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
         const Type* opType = subtreeRoot->leftChild()->getValue()->getType();
         assert((opType->isInteger() || isa<PointerType>(opType)) &&
                "Shr unsupported for other types");
+        unsigned opSize = target.getTargetData().getTypeSize(opType);
         Add3OperandInstr(opType->isSigned()
-                         ? (opType == Type::LongTy ? V9::SRAXr6 : V9::SRAr5)
-                         : (opType == Type::LongTy ? V9::SRLXr6 : V9::SRLr5),
+                         ? (opSize > 4? V9::SRAXr6 : V9::SRAr5)
+                         : (opSize > 4? V9::SRLXr6 : V9::SRLr5),
                          subtreeRoot, mvec);
         break;
       }
@@ -2455,16 +2812,28 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
       case 64: // reg:   Phi(reg,reg)
         break;                          // don't forward the value
 
-      case 65: // reg:   VaArg(reg)
-      {
-        // Use value initialized by va_start as pointer to args on the stack.
-        // Load argument via current pointer value, then increment pointer.
+      case 65: // reg:   VANext(reg):  the va_next(va_list, type) instruction
+      { // Increment the va_list pointer register according to the type.
+        // All LLVM argument types are <= 64 bits, so use one doubleword.
+        Instruction* vaNextI = subtreeRoot->getInstruction();
+        assert(target.getTargetData().getTypeSize(vaNextI->getType()) <= 8 &&
+               "We assumed that all LLVM parameter types <= 8 bytes!");
         int argSize = target.getFrameInfo().getSizeOfEachArgOnStack();
+        mvec.push_back(BuildMI(V9::ADDi, 3).addReg(vaNextI->getOperand(0)).
+                       addSImm(argSize).addRegDef(vaNextI));
+        break;
+      }
+
+      case 66: // reg:   VAArg (reg): the va_arg instruction
+      { // Load argument from stack using current va_list pointer value.
+        // Use 64-bit LDX for all non-FP args, LDF for float, LDDF for double.
         Instruction* vaArgI = subtreeRoot->getInstruction();
-        mvec.push_back(BuildMI(V9::LDXi, 3).addReg(vaArgI->getOperand(0)).
+        MachineOpCode loadOp = (vaArgI->getType()->isFloatingPoint()
+                                ? (vaArgI->getType() == Type::FloatTy
+                                   ? V9::LDFi : V9::LDDFi)
+                                : V9::LDXi);
+        mvec.push_back(BuildMI(loadOp, 3).addReg(vaArgI->getOperand(0)).
                        addSImm(0).addRegDef(vaArgI));
-        mvec.push_back(BuildMI(V9::ADDi, 3).addReg(vaArgI->getOperand(0)).
-                       addSImm(argSize).addRegDef(vaArgI->getOperand(0)));
         break;
       }
       
@@ -2507,20 +2876,53 @@ GetInstructionsByRule(InstructionNode* subtreeRoot,
     if (dest->getType()->isUnsigned()) {
       unsigned destSize=target.getTargetData().getTypeSize(dest->getType());
       if (destSize <= 4) {
-        // Mask high bits.  Use a TmpInstruction to represent the
+        // Mask high 64 - N bits, where N = 8*destSize (size in bits).
+        
+        // Use a TmpInstruction to represent the
         // intermediate result before masking.  Since those instructions
         // have already been generated, go back and substitute tmpI
         // for dest in the result position of each one of them.
-        TmpInstruction *tmpI =
-          new TmpInstruction(MachineCodeForInstruction::get(dest),
-                             dest->getType(), dest, NULL, "maskHi");
+        // 
+        MachineCodeForInstruction& mcfi = MachineCodeForInstruction::get(dest);
+        TmpInstruction *tmpI = new TmpInstruction(mcfi, dest->getType(),
+                                                  dest, NULL, "maskHi");
+        Value* srlArgToUse = tmpI;
+
+        unsigned numSubst = 0;
+        for (unsigned i=0, N=mvec.size(); i < N; ++i) {
+
+          // Make sure we substitute all occurrences of dest in these instrs.
+          // Otherwise, we will have bogus code.
+          bool someArgsWereIgnored = false;
+
+          // Make sure not to substitute an upwards-exposed use -- that would
+          // introduce a use of `tmpI' with no preceding def.  Therefore,
+          // substitute a use or def-and-use operand only if a previous def
+          // operand has already been substituted (i.e., numSubst > 0).
+          // 
+          numSubst += mvec[i]->substituteValue(dest, tmpI,
+                                               /*defsOnly*/ numSubst == 0,
+                                               /*notDefsAndUses*/ numSubst > 0,
+                                               someArgsWereIgnored);
+          assert(!someArgsWereIgnored &&
+                 "Operand `dest' exists but not replaced: probably bogus!");
+        }
+        assert(numSubst > 0 && "Operand `dest' not replaced: probably bogus!");
+
+        // Left shift 32-N if size (N) is less than 32 bits.
+        // Use another tmp. virtual register to represent this result.
+        if (destSize < 4) {
+          srlArgToUse = new TmpInstruction(mcfi, dest->getType(),
+                                           tmpI, NULL, "maskHi2");
+          mvec.push_back(BuildMI(V9::SLLXi6, 3).addReg(tmpI)
+                         .addZImm(8*(4-destSize))
+                         .addReg(srlArgToUse, MOTy::Def));
+        }
 
-        for (unsigned i=0, N=mvec.size(); i < N; ++i)
-          mvec[i]->substituteValue(dest, tmpI);
+        // Logical right shift 32-N to get zero extension in top 64-N bits.
+        mvec.push_back(BuildMI(V9::SRLi5, 3).addReg(srlArgToUse)
+                       .addZImm(8*(4-destSize)).addReg(dest, MOTy::Def));
 
-        M = BuildMI(V9::SRLi5, 3).addReg(tmpI).addZImm(8*(4-destSize))
-          .addReg(dest, MOTy::Def);
-        mvec.push_back(M);
       } else if (destSize < 8) {
         assert(0 && "Unsupported type size: 32 < size < 64 bits");
       }