lib/Target/R600/AMDILPeepholeOptimizer.cpp

   1 //===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 /// \file
   9 //==-----------------------------------------------------------------------===//
  10
  11 #define DEBUG_TYPE "PeepholeOpt"
  12 #ifdef DEBUG
  13 #define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
  14 #else
  15 #define DEBUGME 0
  16 #endif
  17
  18 #include "AMDILDevices.h"
  19 #include "AMDGPUInstrInfo.h"
  20 #include "llvm/ADT/Statistic.h"
  21 #include "llvm/ADT/StringExtras.h"
  22 #include "llvm/ADT/StringRef.h"
  23 #include "llvm/ADT/Twine.h"
  24 #include "llvm/IR/Constants.h"
  25 #include "llvm/CodeGen/MachineFunction.h"
  26 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
  27 #include "llvm/IR/Function.h"
  28 #include "llvm/IR/Instructions.h"
  29 #include "llvm/IR/Module.h"
  30 #include "llvm/Support/Debug.h"
  31 #include "llvm/Support/MathExtras.h"
  32
  33 #include <sstream>
  34
  35 #if 0
  36 STATISTIC(PointerAssignments, "Number of dynamic pointer "
  37     "assigments discovered");
  38 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
  39 #endif
  40
  41 using namespace llvm;
  42 // The Peephole optimization pass is used to do simple last minute optimizations
  43 // that are required for correct code or to remove redundant functions
  44 namespace {
  45
  46 class OpaqueType;
  47
  48 class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
  49 public:
  50   TargetMachine &TM;
  51   static char ID;
  52   AMDGPUPeepholeOpt(TargetMachine &tm);
  53   ~AMDGPUPeepholeOpt();
  54   const char *getPassName() const;
  55   bool runOnFunction(Function &F);
  56   bool doInitialization(Module &M);
  57   bool doFinalization(Module &M);
  58   void getAnalysisUsage(AnalysisUsage &AU) const;
  59 protected:
  60 private:
  61   // Function to initiate all of the instruction level optimizations.
  62   bool instLevelOptimizations(BasicBlock::iterator *inst);
  63   // Quick check to see if we need to dump all of the pointers into the
  64   // arena. If this is correct, then we set all pointers to exist in arena. This
  65   // is a workaround for aliasing of pointers in a struct/union.
  66   bool dumpAllIntoArena(Function &F);
  67   // Because I don't want to invalidate any pointers while in the
  68   // safeNestedForEachFunction. I push atomic conversions to a vector and handle
  69   // it later. This function does the conversions if required.
  70   void doAtomicConversionIfNeeded(Function &F);
  71   // Because __amdil_is_constant cannot be properly evaluated if
  72   // optimizations are disabled, the call's are placed in a vector
  73   // and evaluated after the __amdil_image* functions are evaluated
  74   // which should allow the __amdil_is_constant function to be
  75   // evaluated correctly.
  76   void doIsConstCallConversionIfNeeded();
  77   bool mChanged;
  78   bool mDebug;
  79   bool mConvertAtomics;
  80   CodeGenOpt::Level optLevel;
  81   // Run a series of tests to see if we can optimize a CALL instruction.
  82   bool optimizeCallInst(BasicBlock::iterator *bbb);
  83   // A peephole optimization to optimize bit extract sequences.
  84   bool optimizeBitExtract(Instruction *inst);
  85   // A peephole optimization to optimize bit insert sequences.
  86   bool optimizeBitInsert(Instruction *inst);
  87   bool setupBitInsert(Instruction *base,
  88                       Instruction *&src,
  89                       Constant *&mask,
  90                       Constant *&shift);
  91   // Expand the bit field insert instruction on versions of OpenCL that
  92   // don't support it.
  93   bool expandBFI(CallInst *CI);
  94   // Expand the bit field mask instruction on version of OpenCL that
  95   // don't support it.
  96   bool expandBFM(CallInst *CI);
  97   // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
  98   // this case we need to expand them. These functions check for 24bit functions
  99   // and then expand.
 100   bool isSigned24BitOps(CallInst *CI);
 101   void expandSigned24BitOps(CallInst *CI);
 102   // One optimization that can occur is that if the required workgroup size is
 103   // specified then the result of get_local_size is known at compile time and
 104   // can be returned accordingly.
 105   bool isRWGLocalOpt(CallInst *CI);
 106   // On northern island cards, the division is slightly less accurate than on
 107   // previous generations, so we need to utilize a more accurate division. So we
 108   // can translate the accurate divide to a normal divide on all other cards.
 109   bool convertAccurateDivide(CallInst *CI);
 110   void expandAccurateDivide(CallInst *CI);
 111   // If the alignment is set incorrectly, it can produce really inefficient
 112   // code. This checks for this scenario and fixes it if possible.
 113   bool correctMisalignedMemOp(Instruction *inst);
 114
 115   // If we are in no opt mode, then we need to make sure that
 116   // local samplers are properly propagated as constant propagation
 117   // doesn't occur and we need to know the value of kernel defined
 118   // samplers at compile time.
 119   bool propagateSamplerInst(CallInst *CI);
 120
 121   // Helper functions
 122
 123   // Group of functions that recursively calculate the size of a structure based
 124   // on it's sub-types.
 125   size_t getTypeSize(Type * const T, bool dereferencePtr = false);
 126   size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
 127   size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
 128   size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
 129   size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
 130   size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
 131   size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
 132   size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
 133
 134   LLVMContext *mCTX;
 135   Function *mF;
 136   const AMDGPUSubtarget *mSTM;
 137   SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
 138   SmallVector<CallInst *, 16> isConstVec;
 139 }; // class AMDGPUPeepholeOpt
 140   char AMDGPUPeepholeOpt::ID = 0;
 141
 142 // A template function that has two levels of looping before calling the
 143 // function with a pointer to the current iterator.
 144 template<class InputIterator, class SecondIterator, class Function>
 145 Function safeNestedForEach(InputIterator First, InputIterator Last,
 146                               SecondIterator S, Function F) {
 147   for ( ; First != Last; ++First) {
 148     SecondIterator sf, sl;
 149     for (sf = First->begin(), sl = First->end();
 150          sf != sl; )  {
 151       if (!F(&sf)) {
 152         ++sf;
 153       }
 154     }
 155   }
 156   return F;
 157 }
 158
 159 } // anonymous namespace
 160
 161 namespace llvm {
 162   FunctionPass *
 163   createAMDGPUPeepholeOpt(TargetMachine &tm) {
 164     return new AMDGPUPeepholeOpt(tm);
 165   }
 166 } // llvm namespace
 167
 168 AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
 169   : FunctionPass(ID), TM(tm)  {
 170   mDebug = DEBUGME;
 171   optLevel = TM.getOptLevel();
 172
 173 }
 174
 175 AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()  {
 176 }
 177
 178 const char *
 179 AMDGPUPeepholeOpt::getPassName() const  {
 180   return "AMDGPU PeepHole Optimization Pass";
 181 }
 182
 183 bool
 184 containsPointerType(Type *Ty)  {
 185   if (!Ty) {
 186     return false;
 187   }
 188   switch(Ty->getTypeID()) {
 189   default:
 190     return false;
 191   case Type::StructTyID: {
 192     const StructType *ST = dyn_cast<StructType>(Ty);
 193     for (StructType::element_iterator stb = ST->element_begin(),
 194            ste = ST->element_end(); stb != ste; ++stb) {
 195       if (!containsPointerType(*stb)) {
 196         continue;
 197       }
 198       return true;
 199     }
 200     break;
 201   }
 202   case Type::VectorTyID:
 203   case Type::ArrayTyID:
 204     return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
 205   case Type::PointerTyID:
 206     return true;
 207   };
 208   return false;
 209 }
 210
 211 bool
 212 AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)  {
 213   bool dumpAll = false;
 214   for (Function::const_arg_iterator cab = F.arg_begin(),
 215        cae = F.arg_end(); cab != cae; ++cab) {
 216     const Argument *arg = cab;
 217     const PointerType *PT = dyn_cast<PointerType>(arg->getType());
 218     if (!PT) {
 219       continue;
 220     }
 221     Type *DereferencedType = PT->getElementType();
 222     if (!dyn_cast<StructType>(DereferencedType)
 223         ) {
 224       continue;
 225     }
 226     if (!containsPointerType(DereferencedType)) {
 227       continue;
 228     }
 229     // FIXME: Because a pointer inside of a struct/union may be aliased to
 230     // another pointer we need to take the conservative approach and place all
 231     // pointers into the arena until more advanced detection is implemented.
 232     dumpAll = true;
 233   }
 234   return dumpAll;
 235 }
 236 void
 237 AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
 238   if (isConstVec.empty()) {
 239     return;
 240   }
 241   for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
 242     CallInst *CI = isConstVec[x];
 243     Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
 244     Type *aType = Type::getInt32Ty(*mCTX);
 245     Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
 246       : ConstantInt::get(aType, 0);
 247     CI->replaceAllUsesWith(Val);
 248     CI->eraseFromParent();
 249   }
 250   isConstVec.clear();
 251 }
 252 void
 253 AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)  {
 254   // Don't do anything if we don't have any atomic operations.
 255   if (atomicFuncs.empty()) {
 256     return;
 257   }
 258   // Change the function name for the atomic if it is required
 259   uint32_t size = atomicFuncs.size();
 260   for (uint32_t x = 0; x < size; ++x) {
 261     atomicFuncs[x].first->setOperand(
 262         atomicFuncs[x].first->getNumOperands()-1,
 263         atomicFuncs[x].second);
 264
 265   }
 266   mChanged = true;
 267   if (mConvertAtomics) {
 268     return;
 269   }
 270 }
 271
 272 bool
 273 AMDGPUPeepholeOpt::runOnFunction(Function &MF)  {
 274   mChanged = false;
 275   mF = &MF;
 276   mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
 277   if (mDebug) {
 278     MF.dump();
 279   }
 280   mCTX = &MF.getType()->getContext();
 281   mConvertAtomics = true;
 282   safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
 283      std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
 284                   this));
 285
 286   doAtomicConversionIfNeeded(MF);
 287   doIsConstCallConversionIfNeeded();
 288
 289   if (mDebug) {
 290     MF.dump();
 291   }
 292   return mChanged;
 293 }
 294
 295 bool
 296 AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)  {
 297   Instruction *inst = (*bbb);
 298   CallInst *CI = dyn_cast<CallInst>(inst);
 299   if (!CI) {
 300     return false;
 301   }
 302   if (isSigned24BitOps(CI)) {
 303     expandSigned24BitOps(CI);
 304     ++(*bbb);
 305     CI->eraseFromParent();
 306     return true;
 307   }
 308   if (propagateSamplerInst(CI)) {
 309     return false;
 310   }
 311   if (expandBFI(CI) || expandBFM(CI)) {
 312     ++(*bbb);
 313     CI->eraseFromParent();
 314     return true;
 315   }
 316   if (convertAccurateDivide(CI)) {
 317     expandAccurateDivide(CI);
 318     ++(*bbb);
 319     CI->eraseFromParent();
 320     return true;
 321   }
 322
 323   StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
 324   if (calleeName.startswith("__amdil_is_constant")) {
 325     // If we do not have optimizations, then this
 326     // cannot be properly evaluated, so we add the
 327     // call instruction to a vector and process
 328     // them at the end of processing after the
 329     // samplers have been correctly handled.
 330     if (optLevel == CodeGenOpt::None) {
 331       isConstVec.push_back(CI);
 332       return false;
 333     } else {
 334       Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
 335       Type *aType = Type::getInt32Ty(*mCTX);
 336       Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
 337         : ConstantInt::get(aType, 0);
 338       CI->replaceAllUsesWith(Val);
 339       ++(*bbb);
 340       CI->eraseFromParent();
 341       return true;
 342     }
 343   }
 344
 345   if (calleeName.equals("__amdil_is_asic_id_i32")) {
 346     ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
 347     Type *aType = Type::getInt32Ty(*mCTX);
 348     Value *Val = CV;
 349     if (Val) {
 350       Val = ConstantInt::get(aType,
 351           mSTM->device()->getDeviceFlag() & CV->getZExtValue());
 352     } else {
 353       Val = ConstantInt::get(aType, 0);
 354     }
 355     CI->replaceAllUsesWith(Val);
 356     ++(*bbb);
 357     CI->eraseFromParent();
 358     return true;
 359   }
 360   Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
 361   if (!F) {
 362     return false;
 363   }
 364   if (F->getName().startswith("__atom") && !CI->getNumUses()
 365       && F->getName().find("_xchg") == StringRef::npos) {
 366     std::string buffer(F->getName().str() + "_noret");
 367     F = dyn_cast<Function>(
 368           F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
 369     atomicFuncs.push_back(std::make_pair(CI, F));
 370   }
 371
 372   if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
 373       && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
 374     return false;
 375   }
 376   if (!mConvertAtomics) {
 377     return false;
 378   }
 379   StringRef name = F->getName();
 380   if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
 381     mConvertAtomics = false;
 382   }
 383   return false;
 384 }
 385
 386 bool
 387 AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
 388     Instruction *&src,
 389     Constant *&mask,
 390     Constant *&shift) {
 391   if (!base) {
 392     if (mDebug) {
 393       dbgs() << "Null pointer passed into function.\n";
 394     }
 395     return false;
 396   }
 397   bool andOp = false;
 398   if (base->getOpcode() == Instruction::Shl) {
 399     shift = dyn_cast<Constant>(base->getOperand(1));
 400   } else if (base->getOpcode() == Instruction::And) {
 401     mask = dyn_cast<Constant>(base->getOperand(1));
 402     andOp = true;
 403   } else {
 404     if (mDebug) {
 405       dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
 406     }
 407     // If the base is neither a Shl or a And, we don't fit any of the patterns above.
 408     return false;
 409   }
 410   src = dyn_cast<Instruction>(base->getOperand(0));
 411   if (!src) {
 412     if (mDebug) {
 413       dbgs() << "Failed setup since the base operand is not an instruction!\n";
 414     }
 415     return false;
 416   }
 417   // If we find an 'and' operation, then we don't need to
 418   // find the next operation as we already know the
 419   // bits that are valid at this point.
 420   if (andOp) {
 421     return true;
 422   }
 423   if (src->getOpcode() == Instruction::Shl && !shift) {
 424     shift = dyn_cast<Constant>(src->getOperand(1));
 425     src = dyn_cast<Instruction>(src->getOperand(0));
 426   } else if (src->getOpcode() == Instruction::And && !mask) {
 427     mask = dyn_cast<Constant>(src->getOperand(1));
 428   }
 429   if (!mask && !shift) {
 430     if (mDebug) {
 431       dbgs() << "Failed setup since both mask and shift are NULL!\n";
 432     }
 433     // Did not find a constant mask or a shift.
 434     return false;
 435   }
 436   return true;
 437 }
 438 bool
 439 AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)  {
 440   if (!inst) {
 441     return false;
 442   }
 443   if (!inst->isBinaryOp()) {
 444     return false;
 445   }
 446   if (inst->getOpcode() != Instruction::Or) {
 447     return false;
 448   }
 449   if (optLevel == CodeGenOpt::None) {
 450     return false;
 451   }
 452   // We want to do an optimization on a sequence of ops that in the end equals a
 453   // single ISA instruction.
 454   // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
 455   // Some simplified versions of this pattern are as follows:
 456   // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
 457   // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
 458   // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
 459   // (A & B) | (D << F) when (1 << F) >= B
 460   // (A << C) | (D & E) when (1 << C) >= E
 461   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
 462     // The HD4XXX hardware doesn't support the ubit_insert instruction.
 463     return false;
 464   }
 465   Type *aType = inst->getType();
 466   bool isVector = aType->isVectorTy();
 467   int numEle = 1;
 468   // This optimization only works on 32bit integers.
 469   if (aType->getScalarType()
 470       != Type::getInt32Ty(inst->getContext())) {
 471     return false;
 472   }
 473   if (isVector) {
 474     const VectorType *VT = dyn_cast<VectorType>(aType);
 475     numEle = VT->getNumElements();
 476     // We currently cannot support more than 4 elements in a intrinsic and we
 477     // cannot support Vec3 types.
 478     if (numEle > 4 || numEle == 3) {
 479       return false;
 480     }
 481   }
 482   // TODO: Handle vectors.
 483   if (isVector) {
 484     if (mDebug) {
 485       dbgs() << "!!! Vectors are not supported yet!\n";
 486     }
 487     return false;
 488   }
 489   Instruction *LHSSrc = NULL, *RHSSrc = NULL;
 490   Constant *LHSMask = NULL, *RHSMask = NULL;
 491   Constant *LHSShift = NULL, *RHSShift = NULL;
 492   Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
 493   Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
 494   if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
 495     if (mDebug) {
 496       dbgs() << "Found an OR Operation that failed setup!\n";
 497       inst->dump();
 498       if (LHS) { LHS->dump(); }
 499       if (LHSSrc) { LHSSrc->dump(); }
 500       if (LHSMask) { LHSMask->dump(); }
 501       if (LHSShift) { LHSShift->dump(); }
 502     }
 503     // There was an issue with the setup for BitInsert.
 504     return false;
 505   }
 506   if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
 507     if (mDebug) {
 508       dbgs() << "Found an OR Operation that failed setup!\n";
 509       inst->dump();
 510       if (RHS) { RHS->dump(); }
 511       if (RHSSrc) { RHSSrc->dump(); }
 512       if (RHSMask) { RHSMask->dump(); }
 513       if (RHSShift) { RHSShift->dump(); }
 514     }
 515     // There was an issue with the setup for BitInsert.
 516     return false;
 517   }
 518   if (mDebug) {
 519     dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
 520     dbgs() << "Op:        "; inst->dump();
 521     dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
 522     dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
 523     dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
 524     dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
 525     dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
 526     dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
 527     dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
 528     dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
 529   }
 530   Constant *offset = NULL;
 531   Constant *width = NULL;
 532   uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
 533   uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
 534   uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
 535   uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
 536   lhsMaskVal = (LHSMask
 537       ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
 538   rhsMaskVal = (RHSMask
 539       ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
 540   lhsShiftVal = (LHSShift
 541       ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
 542   rhsShiftVal = (RHSShift
 543       ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
 544   lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
 545   rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
 546   lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
 547   rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
 548   // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
 549   if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
 550     return false;
 551   }
 552   if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
 553     offset = ConstantInt::get(aType, lhsMaskOffset, false);
 554     width = ConstantInt::get(aType, lhsMaskWidth, false);
 555     RHSSrc = RHS;
 556     if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
 557       return false;
 558     }
 559     if (!LHSShift) {
 560       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 561           "MaskShr", LHS);
 562     } else if (lhsShiftVal != lhsMaskOffset) {
 563       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 564           "MaskShr", LHS);
 565     }
 566     if (mDebug) {
 567       dbgs() << "Optimizing LHS!\n";
 568     }
 569   } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
 570     offset = ConstantInt::get(aType, rhsMaskOffset, false);
 571     width = ConstantInt::get(aType, rhsMaskWidth, false);
 572     LHSSrc = RHSSrc;
 573     RHSSrc = LHS;
 574     if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
 575       return false;
 576     }
 577     if (!RHSShift) {
 578       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 579           "MaskShr", RHS);
 580     } else if (rhsShiftVal != rhsMaskOffset) {
 581       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 582           "MaskShr", RHS);
 583     }
 584     if (mDebug) {
 585       dbgs() << "Optimizing RHS!\n";
 586     }
 587   } else {
 588     if (mDebug) {
 589       dbgs() << "Failed constraint 3!\n";
 590     }
 591     return false;
 592   }
 593   if (mDebug) {
 594     dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
 595     dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
 596     dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
 597     dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
 598   }
 599   if (!offset || !width) {
 600     if (mDebug) {
 601       dbgs() << "Either width or offset are NULL, failed detection!\n";
 602     }
 603     return false;
 604   }
 605   // Lets create the function signature.
 606   std::vector<Type *> callTypes;
 607   callTypes.push_back(aType);
 608   callTypes.push_back(aType);
 609   callTypes.push_back(aType);
 610   callTypes.push_back(aType);
 611   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
 612   std::string name = "__amdil_ubit_insert";
 613   if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
 614   Function *Func =
 615     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
 616         getOrInsertFunction(StringRef(name), funcType));
 617   Value *Operands[4] = {
 618     width,
 619     offset,
 620     LHSSrc,
 621     RHSSrc
 622   };
 623   CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
 624   if (mDebug) {
 625     dbgs() << "Old Inst: ";
 626     inst->dump();
 627     dbgs() << "New Inst: ";
 628     CI->dump();
 629     dbgs() << "\n\n";
 630   }
 631   CI->insertBefore(inst);
 632   inst->replaceAllUsesWith(CI);
 633   return true;
 634 }
 635
 636 bool
 637 AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)  {
 638   if (!inst) {
 639     return false;
 640   }
 641   if (!inst->isBinaryOp()) {
 642     return false;
 643   }
 644   if (inst->getOpcode() != Instruction::And) {
 645     return false;
 646   }
 647   if (optLevel == CodeGenOpt::None) {
 648     return false;
 649   }
 650   // We want to do some simple optimizations on Shift right/And patterns. The
 651   // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
 652   // value smaller than 32 and C is a mask. If C is a constant value, then the
 653   // following transformation can occur. For signed integers, it turns into the
 654   // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
 655   // integers, it turns into the function call dst =
 656   // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
 657   // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
 658   // Evergreen hardware.
 659   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
 660     // This does not work on HD4XXX hardware.
 661     return false;
 662   }
 663   Type *aType = inst->getType();
 664   bool isVector = aType->isVectorTy();
 665
 666   // XXX Support vector types
 667   if (isVector) {
 668     return false;
 669   }
 670   int numEle = 1;
 671   // This only works on 32bit integers
 672   if (aType->getScalarType()
 673       != Type::getInt32Ty(inst->getContext())) {
 674     return false;
 675   }
 676   if (isVector) {
 677     const VectorType *VT = dyn_cast<VectorType>(aType);
 678     numEle = VT->getNumElements();
 679     // We currently cannot support more than 4 elements in a intrinsic and we
 680     // cannot support Vec3 types.
 681     if (numEle > 4 || numEle == 3) {
 682       return false;
 683     }
 684   }
 685   BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
 686   // If the first operand is not a shift instruction, then we can return as it
 687   // doesn't match this pattern.
 688   if (!ShiftInst || !ShiftInst->isShift()) {
 689     return false;
 690   }
 691   // If we are a shift left, then we need don't match this pattern.
 692   if (ShiftInst->getOpcode() == Instruction::Shl) {
 693     return false;
 694   }
 695   bool isSigned = ShiftInst->isArithmeticShift();
 696   Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
 697   Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
 698   // Lets make sure that the shift value and the and mask are constant integers.
 699   if (!AndMask || !ShrVal) {
 700     return false;
 701   }
 702   Constant *newMaskConst;
 703   Constant *shiftValConst;
 704   if (isVector) {
 705     // Handle the vector case
 706     std::vector<Constant *> maskVals;
 707     std::vector<Constant *> shiftVals;
 708     ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
 709     ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
 710     Type *scalarType = AndMaskVec->getType()->getScalarType();
 711     assert(AndMaskVec->getNumOperands() ==
 712            ShrValVec->getNumOperands() && "cannot have a "
 713            "combination where the number of elements to a "
 714            "shift and an and are different!");
 715     for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
 716       ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
 717       ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
 718       if (!AndCI || !ShiftIC) {
 719         return false;
 720       }
 721       uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
 722       if (!isMask_32(maskVal)) {
 723         return false;
 724       }
 725       maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
 726       uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
 727       // If the mask or shiftval is greater than the bitcount, then break out.
 728       if (maskVal >= 32 || shiftVal >= 32) {
 729         return false;
 730       }
 731       // If the mask val is greater than the the number of original bits left
 732       // then this optimization is invalid.
 733       if (maskVal > (32 - shiftVal)) {
 734         return false;
 735       }
 736       maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
 737       shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
 738     }
 739     newMaskConst = ConstantVector::get(maskVals);
 740     shiftValConst = ConstantVector::get(shiftVals);
 741   } else {
 742     // Handle the scalar case
 743     uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
 744     // This must be a mask value where all lower bits are set to 1 and then any
 745     // bit higher is set to 0.
 746     if (!isMask_32(maskVal)) {
 747       return false;
 748     }
 749     maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
 750     // Count the number of bits set in the mask, this is the width of the
 751     // resulting bit set that is extracted from the source value.
 752     uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
 753     // If the mask or shift val is greater than the bitcount, then break out.
 754     if (maskVal >= 32 || shiftVal >= 32) {
 755       return false;
 756     }
 757     // If the mask val is greater than the the number of original bits left then
 758     // this optimization is invalid.
 759     if (maskVal > (32 - shiftVal)) {
 760       return false;
 761     }
 762     newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
 763     shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
 764   }
 765   // Lets create the function signature.
 766   std::vector<Type *> callTypes;
 767   callTypes.push_back(aType);
 768   callTypes.push_back(aType);
 769   callTypes.push_back(aType);
 770   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
 771   std::string name = "llvm.AMDGPU.bit.extract.u32";
 772   if (isVector) {
 773     name += ".v" + itostr(numEle) + "i32";
 774   } else {
 775     name += ".";
 776   }
 777   // Lets create the function.
 778   Function *Func =
 779     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
 780                        getOrInsertFunction(StringRef(name), funcType));
 781   Value *Operands[3] = {
 782     ShiftInst->getOperand(0),
 783     shiftValConst,
 784     newMaskConst
 785   };
 786   // Lets create the Call with the operands
 787   CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
 788   CI->setDoesNotAccessMemory();
 789   CI->insertBefore(inst);
 790   inst->replaceAllUsesWith(CI);
 791   return true;
 792 }
 793
 794 bool
 795 AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
 796   if (!CI) {
 797     return false;
 798   }
 799   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 800   if (!LHS->getName().startswith("__amdil_bfi")) {
 801     return false;
 802   }
 803   Type* type = CI->getOperand(0)->getType();
 804   Constant *negOneConst = NULL;
 805   if (type->isVectorTy()) {
 806     std::vector<Constant *> negOneVals;
 807     negOneConst = ConstantInt::get(CI->getContext(),
 808         APInt(32, StringRef("-1"), 10));
 809     for (size_t x = 0,
 810         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
 811       negOneVals.push_back(negOneConst);
 812     }
 813     negOneConst = ConstantVector::get(negOneVals);
 814   } else {
 815     negOneConst = ConstantInt::get(CI->getContext(),
 816         APInt(32, StringRef("-1"), 10));
 817   }
 818   // __amdil_bfi => (A & B) | (~A & C)
 819   BinaryOperator *lhs =
 820     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
 821         CI->getOperand(1), "bfi_and", CI);
 822   BinaryOperator *rhs =
 823     BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
 824         "bfi_not", CI);
 825   rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
 826       "bfi_and", CI);
 827   lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
 828   CI->replaceAllUsesWith(lhs);
 829   return true;
 830 }
 831
 832 bool
 833 AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
 834   if (!CI) {
 835     return false;
 836   }
 837   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 838   if (!LHS->getName().startswith("__amdil_bfm")) {
 839     return false;
 840   }
 841   // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
 842   Constant *newMaskConst = NULL;
 843   Constant *newShiftConst = NULL;
 844   Type* type = CI->getOperand(0)->getType();
 845   if (type->isVectorTy()) {
 846     std::vector<Constant*> newMaskVals, newShiftVals;
 847     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
 848     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
 849     for (size_t x = 0,
 850         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
 851       newMaskVals.push_back(newMaskConst);
 852       newShiftVals.push_back(newShiftConst);
 853     }
 854     newMaskConst = ConstantVector::get(newMaskVals);
 855     newShiftConst = ConstantVector::get(newShiftVals);
 856   } else {
 857     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
 858     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
 859   }
 860   BinaryOperator *lhs =
 861     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
 862         newMaskConst, "bfm_mask", CI);
 863   lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
 864       lhs, "bfm_shl", CI);
 865   lhs = BinaryOperator::Create(Instruction::Sub, lhs,
 866       newShiftConst, "bfm_sub", CI);
 867   BinaryOperator *rhs =
 868     BinaryOperator::Create(Instruction::And, CI->getOperand(1),
 869         newMaskConst, "bfm_mask", CI);
 870   lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
 871   CI->replaceAllUsesWith(lhs);
 872   return true;
 873 }
 874
 875 bool
 876 AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)  {
 877   Instruction *inst = (*bbb);
 878   if (optimizeCallInst(bbb)) {
 879     return true;
 880   }
 881   if (optimizeBitExtract(inst)) {
 882     return false;
 883   }
 884   if (optimizeBitInsert(inst)) {
 885     return false;
 886   }
 887   if (correctMisalignedMemOp(inst)) {
 888     return false;
 889   }
 890   return false;
 891 }
 892 bool
 893 AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
 894   LoadInst *linst = dyn_cast<LoadInst>(inst);
 895   StoreInst *sinst = dyn_cast<StoreInst>(inst);
 896   unsigned alignment;
 897   Type* Ty = inst->getType();
 898   if (linst) {
 899     alignment = linst->getAlignment();
 900     Ty = inst->getType();
 901   } else if (sinst) {
 902     alignment = sinst->getAlignment();
 903     Ty = sinst->getValueOperand()->getType();
 904   } else {
 905     return false;
 906   }
 907   unsigned size = getTypeSize(Ty);
 908   if (size == alignment || size < alignment) {
 909     return false;
 910   }
 911   if (!Ty->isStructTy()) {
 912     return false;
 913   }
 914   if (alignment < 4) {
 915     if (linst) {
 916       linst->setAlignment(0);
 917       return true;
 918     } else if (sinst) {
 919       sinst->setAlignment(0);
 920       return true;
 921     }
 922   }
 923   return false;
 924 }
 925 bool
 926 AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)  {
 927   if (!CI) {
 928     return false;
 929   }
 930   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 931   std::string namePrefix = LHS->getName().substr(0, 14);
 932   if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
 933       && namePrefix != "__amdil__imul24_high") {
 934     return false;
 935   }
 936   if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
 937     return false;
 938   }
 939   return true;
 940 }
 941
 942 void
 943 AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)  {
 944   assert(isSigned24BitOps(CI) && "Must be a "
 945       "signed 24 bit operation to call this function!");
 946   Value *LHS = CI->getOperand(CI->getNumOperands()-1);
 947   // On 7XX and 8XX we do not have signed 24bit, so we need to
 948   // expand it to the following:
 949   // imul24 turns into 32bit imul
 950   // imad24 turns into 32bit imad
 951   // imul24_high turns into 32bit imulhigh
 952   if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
 953     Type *aType = CI->getOperand(0)->getType();
 954     bool isVector = aType->isVectorTy();
 955     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
 956     std::vector<Type*> callTypes;
 957     callTypes.push_back(CI->getOperand(0)->getType());
 958     callTypes.push_back(CI->getOperand(1)->getType());
 959     callTypes.push_back(CI->getOperand(2)->getType());
 960     FunctionType *funcType =
 961       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
 962     std::string name = "__amdil_imad";
 963     if (isVector) {
 964       name += "_v" + itostr(numEle) + "i32";
 965     } else {
 966       name += "_i32";
 967     }
 968     Function *Func = dyn_cast<Function>(
 969                        CI->getParent()->getParent()->getParent()->
 970                        getOrInsertFunction(StringRef(name), funcType));
 971     Value *Operands[3] = {
 972       CI->getOperand(0),
 973       CI->getOperand(1),
 974       CI->getOperand(2)
 975     };
 976     CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
 977     nCI->insertBefore(CI);
 978     CI->replaceAllUsesWith(nCI);
 979   } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
 980     BinaryOperator *mulOp =
 981       BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
 982           CI->getOperand(1), "imul24", CI);
 983     CI->replaceAllUsesWith(mulOp);
 984   } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
 985     Type *aType = CI->getOperand(0)->getType();
 986
 987     bool isVector = aType->isVectorTy();
 988     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
 989     std::vector<Type*> callTypes;
 990     callTypes.push_back(CI->getOperand(0)->getType());
 991     callTypes.push_back(CI->getOperand(1)->getType());
 992     FunctionType *funcType =
 993       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
 994     std::string name = "__amdil_imul_high";
 995     if (isVector) {
 996       name += "_v" + itostr(numEle) + "i32";
 997     } else {
 998       name += "_i32";
 999     }
1000     Function *Func = dyn_cast<Function>(
1001                        CI->getParent()->getParent()->getParent()->
1002                        getOrInsertFunction(StringRef(name), funcType));
1003     Value *Operands[2] = {
1004       CI->getOperand(0),
1005       CI->getOperand(1)
1006     };
1007     CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
1008     nCI->insertBefore(CI);
1009     CI->replaceAllUsesWith(nCI);
1010   }
1011 }
1012
1013 bool
1014 AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)  {
1015   return (CI != NULL
1016           && CI->getOperand(CI->getNumOperands() - 1)->getName()
1017           == "__amdil_get_local_size_int");
1018 }
1019
1020 bool
1021 AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)  {
1022   if (!CI) {
1023     return false;
1024   }
1025   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
1026       && (mSTM->getDeviceName() == "cayman")) {
1027     return false;
1028   }
1029   return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
1030       == "__amdil_improved_div";
1031 }
1032
1033 void
1034 AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)  {
1035   assert(convertAccurateDivide(CI)
1036          && "expanding accurate divide can only happen if it is expandable!");
1037   BinaryOperator *divOp =
1038     BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
1039                            CI->getOperand(1), "fdiv32", CI);
1040   CI->replaceAllUsesWith(divOp);
1041 }
1042
1043 bool
1044 AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
1045   if (optLevel != CodeGenOpt::None) {
1046     return false;
1047   }
1048
1049   if (!CI) {
1050     return false;
1051   }
1052
1053   unsigned funcNameIdx = 0;
1054   funcNameIdx = CI->getNumOperands() - 1;
1055   StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
1056   if (calleeName != "__amdil_image2d_read_norm"
1057    && calleeName != "__amdil_image2d_read_unnorm"
1058    && calleeName != "__amdil_image3d_read_norm"
1059    && calleeName != "__amdil_image3d_read_unnorm") {
1060     return false;
1061   }
1062
1063   unsigned samplerIdx = 2;
1064   samplerIdx = 1;
1065   Value *sampler = CI->getOperand(samplerIdx);
1066   LoadInst *lInst = dyn_cast<LoadInst>(sampler);
1067   if (!lInst) {
1068     return false;
1069   }
1070
1071   if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1072     return false;
1073   }
1074
1075   GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
1076   // If we are loading from what is not a global value, then we
1077   // fail and return.
1078   if (!gv) {
1079     return false;
1080   }
1081
1082   // If we don't have an initializer or we have an initializer and
1083   // the initializer is not a 32bit integer, we fail.
1084   if (!gv->hasInitializer()
1085       || !gv->getInitializer()->getType()->isIntegerTy(32)) {
1086       return false;
1087   }
1088
1089   // Now that we have the global variable initializer, lets replace
1090   // all uses of the load instruction with the samplerVal and
1091   // reparse the __amdil_is_constant() function.
1092   Constant *samplerVal = gv->getInitializer();
1093   lInst->replaceAllUsesWith(samplerVal);
1094   return true;
1095 }
1096
1097 bool
1098 AMDGPUPeepholeOpt::doInitialization(Module &M)  {
1099   return false;
1100 }
1101
1102 bool
1103 AMDGPUPeepholeOpt::doFinalization(Module &M)  {
1104   return false;
1105 }
1106
1107 void
1108 AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const  {
1109   AU.addRequired<MachineFunctionAnalysis>();
1110   FunctionPass::getAnalysisUsage(AU);
1111   AU.setPreservesAll();
1112 }
1113
1114 size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
1115   size_t size = 0;
1116   if (!T) {
1117     return size;
1118   }
1119   switch (T->getTypeID()) {
1120   case Type::X86_FP80TyID:
1121   case Type::FP128TyID:
1122   case Type::PPC_FP128TyID:
1123   case Type::LabelTyID:
1124     assert(0 && "These types are not supported by this backend");
1125   default:
1126   case Type::FloatTyID:
1127   case Type::DoubleTyID:
1128     size = T->getPrimitiveSizeInBits() >> 3;
1129     break;
1130   case Type::PointerTyID:
1131     size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
1132     break;
1133   case Type::IntegerTyID:
1134     size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
1135     break;
1136   case Type::StructTyID:
1137     size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
1138     break;
1139   case Type::ArrayTyID:
1140     size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
1141     break;
1142   case Type::FunctionTyID:
1143     size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
1144     break;
1145   case Type::VectorTyID:
1146     size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
1147     break;
1148   };
1149   return size;
1150 }
1151
1152 size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
1153     bool dereferencePtr) {
1154   size_t size = 0;
1155   if (!ST) {
1156     return size;
1157   }
1158   Type *curType;
1159   StructType::element_iterator eib;
1160   StructType::element_iterator eie;
1161   for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
1162     curType = *eib;
1163     size += getTypeSize(curType, dereferencePtr);
1164   }
1165   return size;
1166 }
1167
1168 size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
1169     bool dereferencePtr) {
1170   return IT ? (IT->getBitWidth() >> 3) : 0;
1171 }
1172
1173 size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
1174     bool dereferencePtr) {
1175     assert(0 && "Should not be able to calculate the size of an function type");
1176     return 0;
1177 }
1178
1179 size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
1180     bool dereferencePtr) {
1181   return (size_t)(AT ? (getTypeSize(AT->getElementType(),
1182                                     dereferencePtr) * AT->getNumElements())
1183                      : 0);
1184 }
1185
1186 size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
1187     bool dereferencePtr) {
1188   return VT ? (VT->getBitWidth() >> 3) : 0;
1189 }
1190
1191 size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
1192     bool dereferencePtr) {
1193   if (!PT) {
1194     return 0;
1195   }
1196   Type *CT = PT->getElementType();
1197   if (CT->getTypeID() == Type::StructTyID &&
1198       PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1199     return getTypeSize(dyn_cast<StructType>(CT));
1200   } else if (dereferencePtr) {
1201     size_t size = 0;
1202     for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
1203       size += getTypeSize(PT->getContainedType(x), dereferencePtr);
1204     }
1205     return size;
1206   } else {
1207     return 4;
1208   }
1209 }
1210
1211 size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
1212     bool dereferencePtr) {
1213   //assert(0 && "Should not be able to calculate the size of an opaque type");
1214   return 4;
1215 }