#include "llvm/Function.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Instructions.h"
+#include "llvm/ParameterAttributes.h"
#include "llvm/Value.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
bool processNonLocalLoad(LoadInst* L,
SmallVector<Instruction*, 4>& toErase);
bool processMemCpy(MemCpyInst* M, SmallVector<Instruction*, 4>& toErase);  // simplifies a memcpy based on the memcpy or call it depends on
+ bool performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
+ SmallVector<Instruction*, 4>& toErase);  // tries to make call C write directly into cpy's sret destination; queues cpy in toErase on success
Value *GetValueForBlock(BasicBlock *BB, LoadInst* orig,
DenseMap<BasicBlock*, Value*> &Phis,
bool top_level = false);
return deletedLoad;
}
+/// performReturnSlotOptzn - takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a return slot optimization: if the
+/// memcpy merely copies the call's sret temporary into the enclosing
+/// function's own sret argument, have the call write its result directly
+/// into that return slot rather than going through a temporary and a memcpy.
+///
+/// On success the call's sret operand is replaced by the memcpy destination,
+/// the call is dropped from the memory dependence analysis, the memcpy is
+/// appended to toErase, and true is returned.  In every other case nothing
+/// is modified and false is returned.
+bool GVN::performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
+                                 SmallVector<Instruction*, 4>& toErase) {
+  // Check that we're copying to an argument...
+  Value* cpyDest = cpy->getDest();
+  if (!isa<Argument>(cpyDest))
+    return false;
+
+  // And that the argument is the return slot
+  Argument* sretArg = cast<Argument>(cpyDest);
+  if (!sretArg->hasStructRetAttr())
+    return false;
+
+  // Make sure the return slot is otherwise dead: walking through GEPs and
+  // bitcasts, the only real user we may reach is the memcpy itself.
+  std::set<User*> useList(sretArg->use_begin(), sretArg->use_end());
+  while (!useList.empty()) {
+    User* UI = *useList.begin();
+
+    if (isa<GetElementPtrInst>(UI) || isa<BitCastInst>(UI)) {
+      // Address computation only; what matters is who uses the result.
+      useList.insert(UI->use_begin(), UI->use_end());
+      useList.erase(UI);
+    } else if (UI == cpy)
+      useList.erase(UI);
+    else
+      return false;
+  }
+
+  // Make sure the call cannot modify the return slot in some unpredicted way
+  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+  if (AA.getModRefInfo(C, cpy->getRawDest(), ~0UL) != AliasAnalysis::NoModRef)
+    return false;
+
+  // Locate the call's own sret argument.  Parameter attribute indices are
+  // offset by one relative to argument indices, hence the i+1.
+  CallSite CS = CallSite::get(C);
+  const unsigned numArgs = CS.arg_size();
+  unsigned sretIdx = numArgs;
+  for (unsigned i = 0; i < numArgs; ++i) {
+    if (CS.paramHasAttr(i+1, ParamAttr::StructRet)) {
+      sretIdx = i;
+      break;
+    }
+  }
+
+  // A call without an sret argument has nothing we can redirect.  Bail out
+  // here; otherwise the memcpy would be erased without the call having been
+  // rewritten, silently dropping the copy.
+  if (sretIdx == numArgs)
+    return false;
+
+  Value* callSlot = CS.getArgument(sretIdx);
+
+  // The memcpy must be copying out of exactly the slot the call returns
+  // into; a call that fills the source buffer through some other pointer
+  // cannot be retargeted by swapping its sret operand.
+  // NOTE(review): relies on getSource() looking through pointer casts the
+  // same way getDest() does above - confirm against IntrinsicInst.h.
+  if (cpy->getSource() != callSlot)
+    return false;
+
+  // Refuse to change the type of the call's operand.
+  if (callSlot->getType() != cpyDest->getType())
+    return false;
+
+  // If all checks passed, then we can perform the transformation
+  CS.setArgument(sretIdx, cpyDest);
+
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  MD.dropInstruction(C);
+
+  // Remove the memcpy
+  toErase.push_back(cpy);
+
+  return true;
+}
+
/// processMemCpy - perform simplification of memcpy's. If we have memcpy A which
/// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
/// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
// First, we have to check that the dependency is another memcpy
Instruction* dep = MD.getDependency(M);
if (dep == MemoryDependenceAnalysis::None ||
- dep == MemoryDependenceAnalysis::NonLocal ||
- !isa<MemCpyInst>(dep))
+ dep == MemoryDependenceAnalysis::NonLocal)
return false;
+ else if (!isa<MemCpyInst>(dep)) {
+ if (CallInst* C = dyn_cast<CallInst>(dep))
+ return performReturnSlotOptzn(M, C, toErase);
+ else
+ return false;
+ }
// We can only transform memcpy's where the dest of one is the source of the
// other
--- /dev/null
+; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep memcpy | count 1
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin9"
+
+define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval %z) nounwind { ; %agg.result is this function's own sret return slot
+entry:
+ %iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
+ %memtmp = alloca { x86_fp80, x86_fp80 }, align 16 ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
+ %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; <x86_fp80*> [#uses=1]
+ %tmp2 = load x86_fp80* %tmp1, align 16 ; <x86_fp80> [#uses=1]
+ %tmp3 = sub x86_fp80 0xK80000000000000000000, %tmp2 ; <x86_fp80> [#uses=1]
+ %tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1 ; <x86_fp80*> [#uses=1]
+ %real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0 ; <x86_fp80*> [#uses=1]
+ %tmp7 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0 ; <x86_fp80*> [#uses=1]
+ %tmp8 = load x86_fp80* %tmp7, align 16 ; <x86_fp80> [#uses=1]
+ store x86_fp80 %tmp3, x86_fp80* %real, align 16
+ store x86_fp80 %tmp8, x86_fp80* %tmp4, align 16
+ call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %memtmp, { x86_fp80, x86_fp80 }* byval %iz ) nounwind ; callee returns its struct into the temporary %memtmp
+ %memtmp14 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8* ; <i8*> [#uses=1]
+ %agg.result15 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8* ; <i8*> [#uses=1]
+ call void @llvm.memcpy.i32( i8* %agg.result15, i8* %memtmp14, i32 32, i32 16 ) ; copies %memtmp into %agg.result; the RUN line's "count 1" only leaves room for the memcpy declaration, so this call must be gone after -gvn
+ ret void
+}
+
+declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval ) nounwind
+
+declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind