From c26292d4dcc3f11279eb6bc524344d926bf3465e Mon Sep 17 00:00:00 2001
From: Richard Osborne
Date: Thu, 27 Feb 2014 13:39:07 +0000
Subject: [PATCH] [XCore] Target optimized library function __memcpy_4()

Summary:
If the src, dst and size of a memcpy are known to be 4 byte aligned we can
call __memcpy_4() instead of memcpy().

Reviewers: robertlytton

Reviewed By: robertlytton

CC: llvm-commits

Differential Revision: http://llvm-reviews.chandlerc.com/D2871

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202395 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/XCore/XCoreSelectionDAGInfo.cpp | 33 ++++++++++++++++++++++
 lib/Target/XCore/XCoreSelectionDAGInfo.h   |  9 ++++++
 test/CodeGen/XCore/byVal.ll                |  2 +-
 test/CodeGen/XCore/memcpy.ll               | 32 +++++++++++++++++++++
 4 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/XCore/memcpy.ll

diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index 44aeb6057cc..68ede6ae6d9 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -21,3 +21,36 @@ XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const XCoreTargetMachine &TM)
 
 XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() {
 }
+
+SDValue XCoreSelectionDAGInfo::
+EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
+                        SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
+                        bool isVolatile, bool AlwaysInline,
+                        MachinePointerInfo DstPtrInfo,
+                        MachinePointerInfo SrcPtrInfo) const
+{
+  unsigned SizeBitWidth = Size.getValueType().getSizeInBits();
+  // Call __memcpy_4 if the src, dst and size are all 4 byte aligned.
+  if (!AlwaysInline && (Align & 3) == 0 &&
+      DAG.MaskedValueIsZero(Size, APInt(SizeBitWidth, 3))) {
+    const TargetLowering &TLI = *DAG.getTarget().getTargetLowering();
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
+    Entry.Node = Dst; Args.push_back(Entry);
+    Entry.Node = Src; Args.push_back(Entry);
+    Entry.Node = Size; Args.push_back(Entry);
+
+    TargetLowering::CallLoweringInfo
+      CLI(Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false,
+          0, TLI.getLibcallCallingConv(RTLIB::MEMCPY), /*isTailCall=*/false,
+          /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
+          DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()), Args, DAG, dl);
+    std::pair<SDValue,SDValue> CallResult =
+      TLI.LowerCallTo(CLI);
+    return CallResult.second;
+  }
+
+  // Otherwise have the target-independent code call memcpy.
+  return SDValue();
+}
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h
index 0386968638b..31704f388a9 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -24,6 +24,15 @@ class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
 public:
   explicit XCoreSelectionDAGInfo(const XCoreTargetMachine &TM);
   ~XCoreSelectionDAGInfo();
+
+  virtual SDValue
+  EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
+                          SDValue Chain,
+                          SDValue Op1, SDValue Op2,
+                          SDValue Op3, unsigned Align, bool isVolatile,
+                          bool AlwaysInline,
+                          MachinePointerInfo DstPtrInfo,
+                          MachinePointerInfo SrcPtrInfo) const;
 };
 
 }
diff --git a/test/CodeGen/XCore/byVal.ll b/test/CodeGen/XCore/byVal.ll
index e9612fd6021..df6c6d351d1 100644
--- a/test/CodeGen/XCore/byVal.ll
+++ b/test/CodeGen/XCore/byVal.ll
@@ -20,7 +20,7 @@ entry:
 ; CHECK: ldaw r5, sp[1]
 ; CHECK: ldc r2, 40
 ; CHECK: mov r0, r5
-; CHECK: bl memcpy
+; CHECK: bl __memcpy_4
 ; CHECK: mov r0, r5
 ; CHECK: bl f1
 ; CHECK: mov r0, r4
diff --git a/test/CodeGen/XCore/memcpy.ll b/test/CodeGen/XCore/memcpy.ll
new file mode 100644
index 00000000000..fe424c50cb2
--- /dev/null
+++ b/test/CodeGen/XCore/memcpy.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+; Optimize memcpy to __memcpy_4 if src, dst and size are all 4 byte aligned.
+define void @f1(i8* %dst, i8* %src, i32 %n) nounwind {
+; CHECK-LABEL: f1:
+; CHECK: bl __memcpy_4
+entry:
+  %0 = shl i32 %n, 2
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %0, i32 4, i1 false)
+  ret void
+}
+
+; Can't optimize - size is not a multiple of 4.
+define void @f2(i8* %dst, i8* %src, i32 %n) nounwind {
+; CHECK-LABEL: f2:
+; CHECK: bl memcpy
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %n, i32 4, i1 false)
+  ret void
+}
+
+; Can't optimize - alignment is not a multiple of 4.
+define void @f3(i8* %dst, i8* %src, i32 %n) nounwind {
+; CHECK-LABEL: f3:
+; CHECK: bl memcpy
+entry:
+  %0 = shl i32 %n, 2
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %0, i32 2, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
-- 
2.34.1