lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp

   1 //===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements the visit functions for load, store and alloca.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "InstCombine.h"
  15 #include "llvm/ADT/Statistic.h"
  16 #include "llvm/Analysis/Loads.h"
  17 #include "llvm/IR/DataLayout.h"
  18 #include "llvm/IR/IntrinsicInst.h"
  19 #include "llvm/Support/CommandLine.h"
  20 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  21 #include "llvm/Transforms/Utils/Local.h"
  22 using namespace llvm;
  23
  24 /// Hidden option to stress test load slicing, i.e., when this option
  25 /// is enabled, load slicing bypasses most of its profitability guards.
  26 /// It will also generate, uncanonalized form of slicing.
  27 static cl::opt<bool>
  28 StressLoadSlicing("instcombine-stress-load-slicing", cl::Hidden,
  29                   cl::desc("Bypass the profitability model of load "
  30                            "slicing"),
  31                   cl::init(false));
  32
  33 STATISTIC(NumDeadStore,    "Number of dead stores eliminated");
  34 STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
  35
  36 /// pointsToConstantGlobal - Return true if V (possibly indirectly) points to
  37 /// some part of a constant global variable.  This intentionally only accepts
  38 /// constant expressions because we can't rewrite arbitrary instructions.
  39 static bool pointsToConstantGlobal(Value *V) {
  40   if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
  41     return GV->isConstant();
  42   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
  43     if (CE->getOpcode() == Instruction::BitCast ||
  44         CE->getOpcode() == Instruction::GetElementPtr)
  45       return pointsToConstantGlobal(CE->getOperand(0));
  46   return false;
  47 }
  48
  49 /// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
  50 /// pointer to an alloca.  Ignore any reads of the pointer, return false if we
  51 /// see any stores or other unknown uses.  If we see pointer arithmetic, keep
  52 /// track of whether it moves the pointer (with IsOffset) but otherwise traverse
  53 /// the uses.  If we see a memcpy/memmove that targets an unoffseted pointer to
  54 /// the alloca, and if the source pointer is a pointer to a constant global, we
  55 /// can optimize this.
  56 static bool
  57 isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
  58                                SmallVectorImpl<Instruction *> &ToDelete,
  59                                bool IsOffset = false) {
  60   // We track lifetime intrinsics as we encounter them.  If we decide to go
  61   // ahead and replace the value with the global, this lets the caller quickly
  62   // eliminate the markers.
  63
  64   for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
  65     User *U = cast<Instruction>(*UI);
  66
  67     if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
  68       // Ignore non-volatile loads, they are always ok.
  69       if (!LI->isSimple()) return false;
  70       continue;
  71     }
  72
  73     if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
  74       // If uses of the bitcast are ok, we are ok.
  75       if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, ToDelete, IsOffset))
  76         return false;
  77       continue;
  78     }
  79     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
  80       // If the GEP has all zero indices, it doesn't offset the pointer.  If it
  81       // doesn't, it does.
  82       if (!isOnlyCopiedFromConstantGlobal(
  83               GEP, TheCopy, ToDelete, IsOffset || !GEP->hasAllZeroIndices()))
  84         return false;
  85       continue;
  86     }
  87
  88     if (CallSite CS = U) {
  89       // If this is the function being called then we treat it like a load and
  90       // ignore it.
  91       if (CS.isCallee(UI))
  92         continue;
  93
  94       // If this is a readonly/readnone call site, then we know it is just a
  95       // load (but one that potentially returns the value itself), so we can
  96       // ignore it if we know that the value isn't captured.
  97       unsigned ArgNo = CS.getArgumentNo(UI);
  98       if (CS.onlyReadsMemory() &&
  99           (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo)))
 100         continue;
 101
 102       // If this is being passed as a byval argument, the caller is making a
 103       // copy, so it is only a read of the alloca.
 104       if (CS.isByValArgument(ArgNo))
 105         continue;
 106     }
 107
 108     // Lifetime intrinsics can be handled by the caller.
 109     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
 110       if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
 111           II->getIntrinsicID() == Intrinsic::lifetime_end) {
 112         assert(II->use_empty() && "Lifetime markers have no result to use!");
 113         ToDelete.push_back(II);
 114         continue;
 115       }
 116     }
 117
 118     // If this is isn't our memcpy/memmove, reject it as something we can't
 119     // handle.
 120     MemTransferInst *MI = dyn_cast<MemTransferInst>(U);
 121     if (MI == 0)
 122       return false;
 123
 124     // If the transfer is using the alloca as a source of the transfer, then
 125     // ignore it since it is a load (unless the transfer is volatile).
 126     if (UI.getOperandNo() == 1) {
 127       if (MI->isVolatile()) return false;
 128       continue;
 129     }
 130
 131     // If we already have seen a copy, reject the second one.
 132     if (TheCopy) return false;
 133
 134     // If the pointer has been offset from the start of the alloca, we can't
 135     // safely handle this.
 136     if (IsOffset) return false;
 137
 138     // If the memintrinsic isn't using the alloca as the dest, reject it.
 139     if (UI.getOperandNo() != 0) return false;
 140
 141     // If the source of the memcpy/move is not a constant global, reject it.
 142     if (!pointsToConstantGlobal(MI->getSource()))
 143       return false;
 144
 145     // Otherwise, the transform is safe.  Remember the copy instruction.
 146     TheCopy = MI;
 147   }
 148   return true;
 149 }
 150
 151 /// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
 152 /// modified by a copy from a constant global.  If we can prove this, we can
 153 /// replace any uses of the alloca with uses of the global directly.
 154 static MemTransferInst *
 155 isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
 156                                SmallVectorImpl<Instruction *> &ToDelete) {
 157   MemTransferInst *TheCopy = 0;
 158   if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
 159     return TheCopy;
 160   return 0;
 161 }
 162
 163 Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
 164   // Ensure that the alloca array size argument has type intptr_t, so that
 165   // any casting is exposed early.
 166   if (TD) {
 167     Type *IntPtrTy = TD->getIntPtrType(AI.getType());
 168     if (AI.getArraySize()->getType() != IntPtrTy) {
 169       Value *V = Builder->CreateIntCast(AI.getArraySize(),
 170                                         IntPtrTy, false);
 171       AI.setOperand(0, V);
 172       return &AI;
 173     }
 174   }
 175
 176   // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
 177   if (AI.isArrayAllocation()) {  // Check C != 1
 178     if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
 179       Type *NewTy =
 180         ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
 181       AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName());
 182       New->setAlignment(AI.getAlignment());
 183
 184       // Scan to the end of the allocation instructions, to skip over a block of
 185       // allocas if possible...also skip interleaved debug info
 186       //
 187       BasicBlock::iterator It = New;
 188       while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It;
 189
 190       // Now that I is pointing to the first non-allocation-inst in the block,
 191       // insert our getelementptr instruction...
 192       //
 193       Type *IdxTy = TD
 194                   ? TD->getIntPtrType(AI.getType())
 195                   : Type::getInt64Ty(AI.getContext());
 196       Value *NullIdx = Constant::getNullValue(IdxTy);
 197       Value *Idx[2] = { NullIdx, NullIdx };
 198       Instruction *GEP =
 199         GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub");
 200       InsertNewInstBefore(GEP, *It);
 201
 202       // Now make everything use the getelementptr instead of the original
 203       // allocation.
 204       return ReplaceInstUsesWith(AI, GEP);
 205     } else if (isa<UndefValue>(AI.getArraySize())) {
 206       return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
 207     }
 208   }
 209
 210   if (TD && AI.getAllocatedType()->isSized()) {
 211     // If the alignment is 0 (unspecified), assign it the preferred alignment.
 212     if (AI.getAlignment() == 0)
 213       AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType()));
 214
 215     // Move all alloca's of zero byte objects to the entry block and merge them
 216     // together.  Note that we only do this for alloca's, because malloc should
 217     // allocate and return a unique pointer, even for a zero byte allocation.
 218     if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) {
 219       // For a zero sized alloca there is no point in doing an array allocation.
 220       // This is helpful if the array size is a complicated expression not used
 221       // elsewhere.
 222       if (AI.isArrayAllocation()) {
 223         AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1));
 224         return &AI;
 225       }
 226
 227       // Get the first instruction in the entry block.
 228       BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
 229       Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
 230       if (FirstInst != &AI) {
 231         // If the entry block doesn't start with a zero-size alloca then move
 232         // this one to the start of the entry block.  There is no problem with
 233         // dominance as the array size was forced to a constant earlier already.
 234         AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
 235         if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
 236             TD->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
 237           AI.moveBefore(FirstInst);
 238           return &AI;
 239         }
 240
 241         // If the alignment of the entry block alloca is 0 (unspecified),
 242         // assign it the preferred alignment.
 243         if (EntryAI->getAlignment() == 0)
 244           EntryAI->setAlignment(
 245             TD->getPrefTypeAlignment(EntryAI->getAllocatedType()));
 246         // Replace this zero-sized alloca with the one at the start of the entry
 247         // block after ensuring that the address will be aligned enough for both
 248         // types.
 249         unsigned MaxAlign = std::max(EntryAI->getAlignment(),
 250                                      AI.getAlignment());
 251         EntryAI->setAlignment(MaxAlign);
 252         if (AI.getType() != EntryAI->getType())
 253           return new BitCastInst(EntryAI, AI.getType());
 254         return ReplaceInstUsesWith(AI, EntryAI);
 255       }
 256     }
 257   }
 258
 259   if (AI.getAlignment()) {
 260     // Check to see if this allocation is only modified by a memcpy/memmove from
 261     // a constant global whose alignment is equal to or exceeds that of the
 262     // allocation.  If this is the case, we can change all users to use
 263     // the constant global instead.  This is commonly produced by the CFE by
 264     // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
 265     // is only subsequently read.
 266     SmallVector<Instruction *, 4> ToDelete;
 267     if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
 268       unsigned SourceAlign = getOrEnforceKnownAlignment(Copy->getSource(),
 269                                                         AI.getAlignment(), TD);
 270       if (AI.getAlignment() <= SourceAlign) {
 271         DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
 272         DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
 273         for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
 274           EraseInstFromFunction(*ToDelete[i]);
 275         Constant *TheSrc = cast<Constant>(Copy->getSource());
 276         Instruction *NewI
 277           = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc,
 278                                                              AI.getType()));
 279         EraseInstFromFunction(*Copy);
 280         ++NumGlobalCopies;
 281         return NewI;
 282       }
 283     }
 284   }
 285
 286   // At last, use the generic allocation site handler to aggressively remove
 287   // unused allocas.
 288   return visitAllocSite(AI);
 289 }
 290
 291
 292 /// InstCombineLoadCast - Fold 'load (cast P)' -> cast (load P)' when possible.
 293 static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
 294                                         const DataLayout *TD) {
 295   User *CI = cast<User>(LI.getOperand(0));
 296   Value *CastOp = CI->getOperand(0);
 297
 298   PointerType *DestTy = cast<PointerType>(CI->getType());
 299   Type *DestPTy = DestTy->getElementType();
 300   if (PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
 301
 302     // If the address spaces don't match, don't eliminate the cast.
 303     if (DestTy->getAddressSpace() != SrcTy->getAddressSpace())
 304       return 0;
 305
 306     Type *SrcPTy = SrcTy->getElementType();
 307
 308     if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() ||
 309          DestPTy->isVectorTy()) {
 310       // If the source is an array, the code below will not succeed.  Check to
 311       // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
 312       // constants.
 313       if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
 314         if (Constant *CSrc = dyn_cast<Constant>(CastOp))
 315           if (ASrcTy->getNumElements() != 0) {
 316             Type *IdxTy = TD
 317                         ? TD->getIntPtrType(SrcTy)
 318                         : Type::getInt64Ty(SrcTy->getContext());
 319             Value *Idx = Constant::getNullValue(IdxTy);
 320             Value *Idxs[2] = { Idx, Idx };
 321             CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs);
 322             SrcTy = cast<PointerType>(CastOp->getType());
 323             SrcPTy = SrcTy->getElementType();
 324           }
 325
 326       if (IC.getDataLayout() &&
 327           (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() ||
 328             SrcPTy->isVectorTy()) &&
 329           // Do not allow turning this into a load of an integer, which is then
 330           // casted to a pointer, this pessimizes pointer analysis a lot.
 331           (SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) &&
 332           IC.getDataLayout()->getTypeSizeInBits(SrcPTy) ==
 333                IC.getDataLayout()->getTypeSizeInBits(DestPTy)) {
 334
 335         // Okay, we are casting from one integer or pointer type to another of
 336         // the same size.  Instead of casting the pointer before the load, cast
 337         // the result of the loaded value.
 338         LoadInst *NewLoad =
 339           IC.Builder->CreateLoad(CastOp, LI.isVolatile(), CI->getName());
 340         NewLoad->setAlignment(LI.getAlignment());
 341         NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope());
 342         // Now cast the result of the load.
 343         return new BitCastInst(NewLoad, LI.getType());
 344       }
 345     }
 346   }
 347   return 0;
 348 }
 349
 350 namespace {
 351   /// \brief Helper structure used to slice a load in smaller loads.
 352   struct LoadedSlice {
 353     // The last instruction that represent the slice. This should be a
 354     // truncate instruction.
 355     Instruction *Inst;
 356     // The original load instruction.
 357     LoadInst *Origin;
 358     // The right shift amount in bits from the original load.
 359     unsigned Shift;
 360
 361     LoadedSlice(Instruction *Inst = NULL, LoadInst *Origin = NULL,
 362                 unsigned Shift = 0)
 363     : Inst(Inst), Origin(Origin), Shift(Shift) {}
 364
 365     LoadedSlice(const LoadedSlice& LS) : Inst(LS.Inst), Origin(LS.Origin),
 366       Shift(LS.Shift) {}
 367
 368     /// \brief Get the bits used in a chunk of bits \p BitWidth large.
 369     /// \return Result is \p BitWidth and has used bits set to 1 and
 370     ///         not used bits set to 0.
 371     APInt getUsedBits() const {
 372       // Reproduce the trunc(lshr) sequence:
 373       // - Start from the truncated value.
 374       // - Zero extend to the desired bit width.
 375       // - Shift left.
 376       assert(Origin && "No original load to compare against.");
 377       unsigned BitWidth = Origin->getType()->getPrimitiveSizeInBits();
 378       assert(Inst && "This slice is not bound to an instruction");
 379       assert(Inst->getType()->getPrimitiveSizeInBits() <= BitWidth &&
 380              "Extracted slice is smaller than the whole type!");
 381       APInt UsedBits(Inst->getType()->getPrimitiveSizeInBits(), 0);
 382       UsedBits.setAllBits();
 383       UsedBits = UsedBits.zext(BitWidth);
 384       UsedBits <<= Shift;
 385       return UsedBits;
 386     }
 387
 388     /// \brief Get the size of the slice to be loaded in bytes.
 389     unsigned getLoadedSize() const {
 390       unsigned SliceSize = getUsedBits().countPopulation();
 391       assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
 392       return SliceSize / 8;
 393     }
 394
 395     /// \brief Get the offset in bytes of this slice in the original chunk of
 396     /// bits, whose layout is defined by \p IsBigEndian.
 397     uint64_t getOffsetFromBase(bool IsBigEndian) const {
 398       assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not support.");
 399       uint64_t Offset = Shift / 8;
 400       unsigned TySizeInBytes = Origin->getType()->getPrimitiveSizeInBits() / 8;
 401       assert(!(Origin->getType()->getPrimitiveSizeInBits() & 0x7) &&
 402              "The size of the original loaded type is not a multiple of a"
 403              " byte.");
 404       // If Offset is bigger than TySizeInBytes, it means we are loading all
 405       // zeros. This should have been optimized before in the process.
 406       assert(TySizeInBytes > Offset &&
 407              "Invalid shift amount for given loaded size");
 408       if (IsBigEndian)
 409         Offset = TySizeInBytes - Offset - getLoadedSize();
 410       return Offset;
 411     }
 412
 413     /// \brief Generate the sequence of instructions to load the slice
 414     /// represented by this object and redirect the uses of this slice to
 415     /// this new sequence of instructions.
 416     /// \pre this->Inst && this->Origin are valid Instructions.
 417     /// \return The last instruction of the sequence used to load the slice.
 418     Instruction *loadSlice(InstCombiner::BuilderTy &Builder,
 419                            bool IsBigEndian) const {
 420       assert(Inst && Origin && "Unable to replace a non-existing slice.");
 421       Value *BaseAddr = Origin->getOperand(0);
 422       unsigned Alignment = Origin->getAlignment();
 423       Builder.SetInsertPoint(Origin);
 424       // Assume we are looking at a chunk of bytes.
 425       // BaseAddr = (i8*)BaseAddr.
 426       BaseAddr = Builder.CreateBitCast(BaseAddr, Builder.getInt8PtrTy(),
 427                                        "raw_cast");
 428       // Get the offset in that chunk of bytes w.r.t. the endianess.
 429       uint64_t Offset = getOffsetFromBase(IsBigEndian);
 430       if (Offset) {
 431         APInt APOffset(64, Offset);
 432         // BaseAddr = BaseAddr + Offset.
 433         BaseAddr = Builder.CreateInBoundsGEP(BaseAddr, Builder.getInt(APOffset),
 434                                              "raw_idx");
 435       }
 436
 437       // Create the type of the loaded slice according to its size.
 438       Type *SliceType =
 439         Type::getIntNTy(Origin->getContext(), getLoadedSize() * 8);
 440
 441       // Bit cast the raw pointer to the pointer type of the slice.
 442       BaseAddr = Builder.CreateBitCast(BaseAddr, SliceType->getPointerTo(),
 443                                        "cast");
 444
 445       // Compute the new alignment.
 446       if (Offset != 0)
 447         Alignment = MinAlign(Alignment, Alignment + Offset);
 448
 449       // Create the load for the slice.
 450       Instruction *LastInst = Builder.CreateAlignedLoad(BaseAddr, Alignment,
 451                                                         Inst->getName()+".val");
 452       // If the final type is not the same as the loaded type, this means that
 453       // we have to pad with zero. Create a zero extend for that.
 454       Type * FinalType = Inst->getType();
 455       if (SliceType != FinalType)
 456         LastInst = cast<Instruction>(Builder.CreateZExt(LastInst, FinalType));
 457
 458       // Update the IR to reflect the new access to the slice.
 459       Inst->replaceAllUsesWith(LastInst);
 460
 461       return LastInst;
 462     }
 463
 464     /// \brief Check if it would be profitable to expand this slice as an
 465     /// independant load.
 466     bool isProfitable() const {
 467       // Slicing is assumed to be profitable iff the chains leads to arithmetic
 468       // operations.
 469       SmallVector<const Instruction *, 8> Uses;
 470       Uses.push_back(Inst);
 471       do {
 472         const Instruction *Use = Uses.pop_back_val();
 473         for (Value::const_use_iterator UseIt = Use->use_begin(),
 474              UseItEnd = Use->use_end(); UseIt != UseItEnd; ++UseIt) {
 475           const Instruction *UseOfUse = cast<Instruction>(*UseIt);
 476           // Consider these instructions as arithmetic operations.
 477           if (isa<BinaryOperator>(UseOfUse) ||
 478               isa<CastInst>(UseOfUse) ||
 479               isa<PHINode>(UseOfUse) ||
 480               isa<GetElementPtrInst>(UseOfUse))
 481             return true;
 482           // No need to check if the Use has already been checked as we do not
 483           // insert any PHINode.
 484           Uses.push_back(UseOfUse);
 485         }
 486       } while (!Uses.empty());
 487       DEBUG(dbgs() << "IC: Not a profitable slice " << *Inst << '\n');
 488       return false;
 489     }
 490   };
 491 }
 492
 493 /// \brief Check the profitability of all involved LoadedSlice.
 494 /// Unless StressLoadSlicing is specified, this also returns false
 495 /// when slicing is not in the canonical form.
 496 /// The canonical form of sliced load is (1) two loads,
 497 /// which are (2) next to each other in memory.
 498 ///
 499 /// FIXME: We may want to allow more slices to be created but
 500 /// this means other passes should know how to deal with all those
 501 /// slices.
 502 /// FIXME: We may want to split loads to different types, e.g.,
 503 /// int vs. float.
 504 static bool
 505 isSlicingProfitable(const SmallVectorImpl<LoadedSlice> &LoadedSlices,
 506                     const APInt &UsedBits) {
 507   unsigned NbOfSlices = LoadedSlices.size();
 508   // Check (1).
 509   if (!StressLoadSlicing && NbOfSlices != 2)
 510     return false;
 511
 512   // Check (2).
 513   if (!StressLoadSlicing && !UsedBits.isAllOnesValue()) {
 514     // Get rid of the unused bits on the right.
 515     APInt MemoryLayout = UsedBits.lshr(UsedBits.countTrailingZeros());
 516     // Get rid of the unused bits on the left.
 517     if (MemoryLayout.countLeadingZeros())
 518       MemoryLayout = MemoryLayout.trunc(MemoryLayout.getActiveBits());
 519     // Check that the chunk of memory is completely used.
 520     if (!MemoryLayout.isAllOnesValue())
 521       return false;
 522   }
 523
 524   unsigned NbOfProfitableSlices = 0;
 525   for (unsigned CurrSlice = 0; CurrSlice < NbOfSlices; ++CurrSlice) {
 526     if (LoadedSlices[CurrSlice].isProfitable())
 527       ++NbOfProfitableSlices;
 528     else if (!StressLoadSlicing)
 529       return false;
 530   }
 531   // In Stress mode, we may have 0 profitable slice.
 532   // Check that here.
 533   // In non-Stress mode, all the slices are profitable at this point.
 534   return NbOfProfitableSlices > 0;
 535 }
 536
 537 /// \brief If the given load, \p LI, is used only by trunc or trunc(lshr)
 538 /// operations, split it in the various pieces being extracted.
 539 ///
 540 /// This sort of thing is introduced by SROA.
 541 /// This slicing takes care not to insert overlapping loads.
 542 /// \pre LI is a simple load (i.e., not an atomic or volatile load).
 543 static Instruction *sliceUpLoadInst(LoadInst &LI,
 544                                     InstCombiner::BuilderTy &Builder,
 545                                     DataLayout &TD) {
 546   assert(LI.isSimple() && "We are trying to transform a non-simple load!");
 547
 548   // FIXME: If we want to support floating point and vector types, we should
 549   // support bitcast and extract/insert element instructions.
 550   Type *LITy = LI.getType();
 551   if (!LITy->isIntegerTy()) return 0;
 552
 553   // Keep track of already used bits to detect overlapping values.
 554   // In that case, we will just abort the transformation.
 555   APInt UsedBits(LITy->getPrimitiveSizeInBits(), 0);
 556
 557   SmallVector<LoadedSlice, 4> LoadedSlices;
 558
 559   // Check if this load is used as several smaller chunks of bits.
 560   // Basically, look for uses in trunc or trunc(lshr) and record a new chain
 561   // of computation for each trunc.
 562   for (Value::use_iterator UI = LI.use_begin(), UIEnd = LI.use_end();
 563        UI != UIEnd; ++UI) {
 564     Instruction *User = cast<Instruction>(*UI);
 565     unsigned Shift = 0;
 566
 567     // Check if this is a trunc(lshr).
 568     if (User->getOpcode() == Instruction::LShr && User->hasOneUse() &&
 569         isa<ConstantInt>(User->getOperand(1))) {
 570       Shift = cast<ConstantInt>(User->getOperand(1))->getZExtValue();
 571       User = User->use_back();
 572     }
 573
 574     // At this point, User is a TruncInst, iff we encountered, trunc or
 575     // trunc(lshr).
 576     if (!isa<TruncInst>(User))
 577       return 0;
 578
 579     // The width of the type must be a power of 2 and greater than 8-bits.
 580     // Otherwise the load cannot be represented in LLVM IR.
 581     // Moreover, if we shifted with a non 8-bits multiple, the slice
 582     // will be accross several bytes. We do not support that.
 583     unsigned Width = User->getType()->getPrimitiveSizeInBits();
 584     if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
 585       return 0;
 586
 587     // Build the slice for this chain of computations.
 588     LoadedSlice LS(User, &LI, Shift);
 589     APInt CurrentUsedBits = LS.getUsedBits();
 590
 591     // Check if this slice overlaps with another.
 592     if ((CurrentUsedBits & UsedBits) != 0)
 593       return 0;
 594     // Update the bits used globally.
 595     UsedBits |= CurrentUsedBits;
 596
 597     // Record the slice.
 598     LoadedSlices.push_back(LS);
 599   }
 600
 601   // Abort slicing if it does not seem to be profitable.
 602   if (!isSlicingProfitable(LoadedSlices, UsedBits))
 603     return 0;
 604
 605   // Rewrite each chain to use an independent load.
 606   // By construction, each chain can be represented by a unique load.
 607   bool IsBigEndian = TD.isBigEndian();
 608   for (SmallVectorImpl<LoadedSlice>::const_iterator LSIt = LoadedSlices.begin(),
 609        LSItEnd = LoadedSlices.end(); LSIt != LSItEnd; ++LSIt) {
 610     Instruction *SliceInst = LSIt->loadSlice(Builder, IsBigEndian);
 611     (void)SliceInst;
 612     DEBUG(dbgs() << "IC: Replacing " << *LSIt->Inst << "\n"
 613                     "    with " << *SliceInst << '\n');
 614   }
 615   return 0; // Don't do anything with LI.
 616 }
 617
 618 Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
 619   Value *Op = LI.getOperand(0);
 620
 621   // Attempt to improve the alignment.
 622   if (TD) {
 623     unsigned KnownAlign =
 624       getOrEnforceKnownAlignment(Op, TD->getPrefTypeAlignment(LI.getType()),TD);
 625     unsigned LoadAlign = LI.getAlignment();
 626     unsigned EffectiveLoadAlign = LoadAlign != 0 ? LoadAlign :
 627       TD->getABITypeAlignment(LI.getType());
 628
 629     if (KnownAlign > EffectiveLoadAlign)
 630       LI.setAlignment(KnownAlign);
 631     else if (LoadAlign == 0)
 632       LI.setAlignment(EffectiveLoadAlign);
 633   }
 634
 635   // load (cast X) --> cast (load X) iff safe.
 636   if (isa<CastInst>(Op))
 637     if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
 638       return Res;
 639
 640   // None of the following transforms are legal for volatile/atomic loads.
 641   // FIXME: Some of it is okay for atomic loads; needs refactoring.
 642   if (!LI.isSimple()) return 0;
 643
 644   // Do really simple store-to-load forwarding and load CSE, to catch cases
 645   // where there are several consecutive memory accesses to the same location,
 646   // separated by a few arithmetic operations.
 647   BasicBlock::iterator BBI = &LI;
 648   if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
 649     return ReplaceInstUsesWith(LI, AvailableVal);
 650
 651   // load(gep null, ...) -> unreachable
 652   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
 653     const Value *GEPI0 = GEPI->getOperand(0);
 654     // TODO: Consider a target hook for valid address spaces for this xform.
 655     if (isa<ConstantPointerNull>(GEPI0) && GEPI->getPointerAddressSpace() == 0){
 656       // Insert a new store to null instruction before the load to indicate
 657       // that this code is not reachable.  We do this instead of inserting
 658       // an unreachable instruction directly because we cannot modify the
 659       // CFG.
 660       new StoreInst(UndefValue::get(LI.getType()),
 661                     Constant::getNullValue(Op->getType()), &LI);
 662       return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
 663     }
 664   }
 665
 666   // load null/undef -> unreachable
 667   // TODO: Consider a target hook for valid address spaces for this xform.
 668   if (isa<UndefValue>(Op) ||
 669       (isa<ConstantPointerNull>(Op) && LI.getPointerAddressSpace() == 0)) {
 670     // Insert a new store to null instruction before the load to indicate that
 671     // this code is not reachable.  We do this instead of inserting an
 672     // unreachable instruction directly because we cannot modify the CFG.
 673     new StoreInst(UndefValue::get(LI.getType()),
 674                   Constant::getNullValue(Op->getType()), &LI);
 675     return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
 676   }
 677
 678   // Instcombine load (constantexpr_cast global) -> cast (load global)
 679   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op))
 680     if (CE->isCast())
 681       if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
 682         return Res;
 683
 684   if (Op->hasOneUse()) {
 685     // Change select and PHI nodes to select values instead of addresses: this
 686     // helps alias analysis out a lot, allows many others simplifications, and
 687     // exposes redundancy in the code.
 688     //
 689     // Note that we cannot do the transformation unless we know that the
 690     // introduced loads cannot trap!  Something like this is valid as long as
 691     // the condition is always false: load (select bool %C, int* null, int* %G),
 692     // but it would not be valid if we transformed it to load from null
 693     // unconditionally.
 694     //
 695     if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
 696       // load (select (Cond, &V1, &V2))  --> select(Cond, load &V1, load &V2).
 697       unsigned Align = LI.getAlignment();
 698       if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align, TD) &&
 699           isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align, TD)) {
 700         LoadInst *V1 = Builder->CreateLoad(SI->getOperand(1),
 701                                            SI->getOperand(1)->getName()+".val");
 702         LoadInst *V2 = Builder->CreateLoad(SI->getOperand(2),
 703                                            SI->getOperand(2)->getName()+".val");
 704         V1->setAlignment(Align);
 705         V2->setAlignment(Align);
 706         return SelectInst::Create(SI->getCondition(), V1, V2);
 707       }
 708
 709       // load (select (cond, null, P)) -> load P
 710       if (Constant *C = dyn_cast<Constant>(SI->getOperand(1)))
 711         if (C->isNullValue()) {
 712           LI.setOperand(0, SI->getOperand(2));
 713           return &LI;
 714         }
 715
 716       // load (select (cond, P, null)) -> load P
 717       if (Constant *C = dyn_cast<Constant>(SI->getOperand(2)))
 718         if (C->isNullValue()) {
 719           LI.setOperand(0, SI->getOperand(1));
 720           return &LI;
 721         }
 722     }
 723   }
 724
 725   // Try to split a load in smaller non-overlapping loads to expose independant
 726   // chain of computations and get rid of trunc/lshr sequence of code.
 727   // The data layout is required for that operation, as code generation will
 728   // change with respect to endianess.
 729   if (TD)
 730     return sliceUpLoadInst(LI, *Builder, *TD);
 731   return 0;
 732 }
 733
 734 /// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P
 735 /// when possible.  This makes it generally easy to do alias analysis and/or
 736 /// SROA/mem2reg of the memory object.
 737 static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
 738   User *CI = cast<User>(SI.getOperand(1));
 739   Value *CastOp = CI->getOperand(0);
 740
 741   Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
 742   PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
 743   if (SrcTy == 0) return 0;
 744
 745   Type *SrcPTy = SrcTy->getElementType();
 746
 747   if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy())
 748     return 0;
 749
 750   /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep"
 751   /// to its first element.  This allows us to handle things like:
 752   ///   store i32 xxx, (bitcast {foo*, float}* %P to i32*)
 753   /// on 32-bit hosts.
 754   SmallVector<Value*, 4> NewGEPIndices;
 755
 756   // If the source is an array, the code below will not succeed.  Check to
 757   // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
 758   // constants.
 759   if (SrcPTy->isArrayTy() || SrcPTy->isStructTy()) {
 760     // Index through pointer.
 761     Constant *Zero = Constant::getNullValue(Type::getInt32Ty(SI.getContext()));
 762     NewGEPIndices.push_back(Zero);
 763
 764     while (1) {
 765       if (StructType *STy = dyn_cast<StructType>(SrcPTy)) {
 766         if (!STy->getNumElements()) /* Struct can be empty {} */
 767           break;
 768         NewGEPIndices.push_back(Zero);
 769         SrcPTy = STy->getElementType(0);
 770       } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) {
 771         NewGEPIndices.push_back(Zero);
 772         SrcPTy = ATy->getElementType();
 773       } else {
 774         break;
 775       }
 776     }
 777
 778     SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace());
 779   }
 780
 781   if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy())
 782     return 0;
 783
 784   // If the pointers point into different address spaces or if they point to
 785   // values with different sizes, we can't do the transformation.
 786   if (!IC.getDataLayout() ||
 787       SrcTy->getAddressSpace() !=
 788         cast<PointerType>(CI->getType())->getAddressSpace() ||
 789       IC.getDataLayout()->getTypeSizeInBits(SrcPTy) !=
 790       IC.getDataLayout()->getTypeSizeInBits(DestPTy))
 791     return 0;
 792
 793   // Okay, we are casting from one integer or pointer type to another of
 794   // the same size.  Instead of casting the pointer before
 795   // the store, cast the value to be stored.
 796   Value *NewCast;
 797   Value *SIOp0 = SI.getOperand(0);
 798   Instruction::CastOps opcode = Instruction::BitCast;
 799   Type* CastSrcTy = SIOp0->getType();
 800   Type* CastDstTy = SrcPTy;
 801   if (CastDstTy->isPointerTy()) {
 802     if (CastSrcTy->isIntegerTy())
 803       opcode = Instruction::IntToPtr;
 804   } else if (CastDstTy->isIntegerTy()) {
 805     if (SIOp0->getType()->isPointerTy())
 806       opcode = Instruction::PtrToInt;
 807   }
 808
 809   // SIOp0 is a pointer to aggregate and this is a store to the first field,
 810   // emit a GEP to index into its first field.
 811   if (!NewGEPIndices.empty())
 812     CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices);
 813
 814   NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy,
 815                                    SIOp0->getName()+".c");
 816   SI.setOperand(0, NewCast);
 817   SI.setOperand(1, CastOp);
 818   return &SI;
 819 }
 820
 821 /// equivalentAddressValues - Test if A and B will obviously have the same
 822 /// value. This includes recognizing that %t0 and %t1 will have the same
 823 /// value in code like this:
 824 ///   %t0 = getelementptr \@a, 0, 3
 825 ///   store i32 0, i32* %t0
 826 ///   %t1 = getelementptr \@a, 0, 3
 827 ///   %t2 = load i32* %t1
 828 ///
 829 static bool equivalentAddressValues(Value *A, Value *B) {
 830   // Test if the values are trivially equivalent.
 831   if (A == B) return true;
 832
 833   // Test if the values come form identical arithmetic instructions.
 834   // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
 835   // its only used to compare two uses within the same basic block, which
 836   // means that they'll always either have the same value or one of them
 837   // will have an undefined value.
 838   if (isa<BinaryOperator>(A) ||
 839       isa<CastInst>(A) ||
 840       isa<PHINode>(A) ||
 841       isa<GetElementPtrInst>(A))
 842     if (Instruction *BI = dyn_cast<Instruction>(B))
 843       if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
 844         return true;
 845
 846   // Otherwise they may not be equivalent.
 847   return false;
 848 }
 849
 850 Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
 851   Value *Val = SI.getOperand(0);
 852   Value *Ptr = SI.getOperand(1);
 853
 854   // Attempt to improve the alignment.
 855   if (TD) {
 856     unsigned KnownAlign =
 857       getOrEnforceKnownAlignment(Ptr, TD->getPrefTypeAlignment(Val->getType()),
 858                                  TD);
 859     unsigned StoreAlign = SI.getAlignment();
 860     unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign :
 861       TD->getABITypeAlignment(Val->getType());
 862
 863     if (KnownAlign > EffectiveStoreAlign)
 864       SI.setAlignment(KnownAlign);
 865     else if (StoreAlign == 0)
 866       SI.setAlignment(EffectiveStoreAlign);
 867   }
 868
 869   // Don't hack volatile/atomic stores.
 870   // FIXME: Some bits are legal for atomic stores; needs refactoring.
 871   if (!SI.isSimple()) return 0;
 872
 873   // If the RHS is an alloca with a single use, zapify the store, making the
 874   // alloca dead.
 875   if (Ptr->hasOneUse()) {
 876     if (isa<AllocaInst>(Ptr))
 877       return EraseInstFromFunction(SI);
 878     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
 879       if (isa<AllocaInst>(GEP->getOperand(0))) {
 880         if (GEP->getOperand(0)->hasOneUse())
 881           return EraseInstFromFunction(SI);
 882       }
 883     }
 884   }
 885
 886   // Do really simple DSE, to catch cases where there are several consecutive
 887   // stores to the same location, separated by a few arithmetic operations. This
 888   // situation often occurs with bitfield accesses.
 889   BasicBlock::iterator BBI = &SI;
 890   for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
 891        --ScanInsts) {
 892     --BBI;
 893     // Don't count debug info directives, lest they affect codegen,
 894     // and we skip pointer-to-pointer bitcasts, which are NOPs.
 895     if (isa<DbgInfoIntrinsic>(BBI) ||
 896         (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
 897       ScanInsts++;
 898       continue;
 899     }
 900
 901     if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
 902       // Prev store isn't volatile, and stores to the same location?
 903       if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1),
 904                                                         SI.getOperand(1))) {
 905         ++NumDeadStore;
 906         ++BBI;
 907         EraseInstFromFunction(*PrevSI);
 908         continue;
 909       }
 910       break;
 911     }
 912
 913     // If this is a load, we have to stop.  However, if the loaded value is from
 914     // the pointer we're loading and is producing the pointer we're storing,
 915     // then *this* store is dead (X = load P; store X -> P).
 916     if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
 917       if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) &&
 918           LI->isSimple())
 919         return EraseInstFromFunction(SI);
 920
 921       // Otherwise, this is a load from some other location.  Stores before it
 922       // may not be dead.
 923       break;
 924     }
 925
 926     // Don't skip over loads or things that can modify memory.
 927     if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory())
 928       break;
 929   }
 930
 931   // store X, null    -> turns into 'unreachable' in SimplifyCFG
 932   if (isa<ConstantPointerNull>(Ptr) && SI.getPointerAddressSpace() == 0) {
 933     if (!isa<UndefValue>(Val)) {
 934       SI.setOperand(0, UndefValue::get(Val->getType()));
 935       if (Instruction *U = dyn_cast<Instruction>(Val))
 936         Worklist.Add(U);  // Dropped a use.
 937     }
 938     return 0;  // Do not modify these!
 939   }
 940
 941   // store undef, Ptr -> noop
 942   if (isa<UndefValue>(Val))
 943     return EraseInstFromFunction(SI);
 944
 945   // If the pointer destination is a cast, see if we can fold the cast into the
 946   // source instead.
 947   if (isa<CastInst>(Ptr))
 948     if (Instruction *Res = InstCombineStoreToCast(*this, SI))
 949       return Res;
 950   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
 951     if (CE->isCast())
 952       if (Instruction *Res = InstCombineStoreToCast(*this, SI))
 953         return Res;
 954
 955
 956   // If this store is the last instruction in the basic block (possibly
 957   // excepting debug info instructions), and if the block ends with an
 958   // unconditional branch, try to move it to the successor block.
 959   BBI = &SI;
 960   do {
 961     ++BBI;
 962   } while (isa<DbgInfoIntrinsic>(BBI) ||
 963            (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy()));
 964   if (BranchInst *BI = dyn_cast<BranchInst>(BBI))
 965     if (BI->isUnconditional())
 966       if (SimplifyStoreAtEndOfBlock(SI))
 967         return 0;  // xform done!
 968
 969   return 0;
 970 }
 971
 972 /// SimplifyStoreAtEndOfBlock - Turn things like:
 973 ///   if () { *P = v1; } else { *P = v2 }
 974 /// into a phi node with a store in the successor.
 975 ///
 976 /// Simplify things like:
 977 ///   *P = v1; if () { *P = v2; }
 978 /// into a phi node with a store in the successor.
 979 ///
 980 bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
 981   BasicBlock *StoreBB = SI.getParent();
 982
 983   // Check to see if the successor block has exactly two incoming edges.  If
 984   // so, see if the other predecessor contains a store to the same location.
 985   // if so, insert a PHI node (if needed) and move the stores down.
 986   BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
 987
 988   // Determine whether Dest has exactly two predecessors and, if so, compute
 989   // the other predecessor.
 990   pred_iterator PI = pred_begin(DestBB);
 991   BasicBlock *P = *PI;
 992   BasicBlock *OtherBB = 0;
 993
 994   if (P != StoreBB)
 995     OtherBB = P;
 996
 997   if (++PI == pred_end(DestBB))
 998     return false;
 999
1000   P = *PI;
1001   if (P != StoreBB) {
1002     if (OtherBB)
1003       return false;
1004     OtherBB = P;
1005   }
1006   if (++PI != pred_end(DestBB))
1007     return false;
1008
1009   // Bail out if all the relevant blocks aren't distinct (this can happen,
1010   // for example, if SI is in an infinite loop)
1011   if (StoreBB == DestBB || OtherBB == DestBB)
1012     return false;
1013
1014   // Verify that the other block ends in a branch and is not otherwise empty.
1015   BasicBlock::iterator BBI = OtherBB->getTerminator();
1016   BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
1017   if (!OtherBr || BBI == OtherBB->begin())
1018     return false;
1019
1020   // If the other block ends in an unconditional branch, check for the 'if then
1021   // else' case.  there is an instruction before the branch.
1022   StoreInst *OtherStore = 0;
1023   if (OtherBr->isUnconditional()) {
1024     --BBI;
1025     // Skip over debugging info.
1026     while (isa<DbgInfoIntrinsic>(BBI) ||
1027            (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
1028       if (BBI==OtherBB->begin())
1029         return false;
1030       --BBI;
1031     }
1032     // If this isn't a store, isn't a store to the same location, or is not the
1033     // right kind of store, bail out.
1034     OtherStore = dyn_cast<StoreInst>(BBI);
1035     if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
1036         !SI.isSameOperationAs(OtherStore))
1037       return false;
1038   } else {
1039     // Otherwise, the other block ended with a conditional branch. If one of the
1040     // destinations is StoreBB, then we have the if/then case.
1041     if (OtherBr->getSuccessor(0) != StoreBB &&
1042         OtherBr->getSuccessor(1) != StoreBB)
1043       return false;
1044
1045     // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
1046     // if/then triangle.  See if there is a store to the same ptr as SI that
1047     // lives in OtherBB.
1048     for (;; --BBI) {
1049       // Check to see if we find the matching store.
1050       if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
1051         if (OtherStore->getOperand(1) != SI.getOperand(1) ||
1052             !SI.isSameOperationAs(OtherStore))
1053           return false;
1054         break;
1055       }
1056       // If we find something that may be using or overwriting the stored
1057       // value, or if we run out of instructions, we can't do the xform.
1058       if (BBI->mayReadFromMemory() || BBI->mayWriteToMemory() ||
1059           BBI == OtherBB->begin())
1060         return false;
1061     }
1062
1063     // In order to eliminate the store in OtherBr, we have to
1064     // make sure nothing reads or overwrites the stored value in
1065     // StoreBB.
1066     for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
1067       // FIXME: This should really be AA driven.
1068       if (I->mayReadFromMemory() || I->mayWriteToMemory())
1069         return false;
1070     }
1071   }
1072
1073   // Insert a PHI node now if we need it.
1074   Value *MergedVal = OtherStore->getOperand(0);
1075   if (MergedVal != SI.getOperand(0)) {
1076     PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
1077     PN->addIncoming(SI.getOperand(0), SI.getParent());
1078     PN->addIncoming(OtherStore->getOperand(0), OtherBB);
1079     MergedVal = InsertNewInstBefore(PN, DestBB->front());
1080   }
1081
1082   // Advance to a place where it is safe to insert the new store and
1083   // insert it.
1084   BBI = DestBB->getFirstInsertionPt();
1085   StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1),
1086                                    SI.isVolatile(),
1087                                    SI.getAlignment(),
1088                                    SI.getOrdering(),
1089                                    SI.getSynchScope());
1090   InsertNewInstBefore(NewSI, *BBI);
1091   NewSI->setDebugLoc(OtherStore->getDebugLoc());
1092
1093   // If the two stores had the same TBAA tag, preserve it.
1094   if (MDNode *TBAATag = SI.getMetadata(LLVMContext::MD_tbaa))
1095     if ((TBAATag = MDNode::getMostGenericTBAA(TBAATag,
1096                                OtherStore->getMetadata(LLVMContext::MD_tbaa))))
1097       NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
1098
1099
1100   // Nuke the old stores.
1101   EraseInstFromFunction(SI);
1102   EraseInstFromFunction(*OtherStore);
1103   return true;
1104 }