From: Robin Morisset
Date: Tue, 23 Sep 2014 20:46:49 +0000 (+0000)
Subject: [Power] Use AtomicExpandPass for fence insertion, and use lwsync where appropriate
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=58bca6e8ec2ab4a2ba1b1981401a76f54a84e6b2

[Power] Use AtomicExpandPass for fence insertion, and use lwsync where appropriate

Summary:
This patch makes use of AtomicExpandPass in Power for inserting fences around atomic operations, as part of an effort to remove fence insertion from SelectionDAGBuilder. As a big bonus, it lets us use sync 1 (lightweight sync, often referred to by the mnemonic lwsync) instead of sync 0 (heavyweight sync) in many cases.

I also added a test, as there was no test for the barriers emitted by the Power backend for atomic loads and stores.

Test Plan: new test + make check-all

Reviewers: jfb

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D5180

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218331 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/IR/IntrinsicsPowerPC.td b/include/llvm/IR/IntrinsicsPowerPC.td
index 49ddfb8b613..645e2844d23 100644
--- a/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/include/llvm/IR/IntrinsicsPowerPC.td
@@ -28,8 +28,10 @@ let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
   def int_ppc_dcbz : Intrinsic<[], [llvm_ptr_ty], []>;
   def int_ppc_dcbzl : Intrinsic<[], [llvm_ptr_ty], []>;
 
-  // sync instruction
+  // sync instruction (i.e. sync 0, a.k.a. hwsync)
   def int_ppc_sync : Intrinsic<[], [], []>;
+  // lwsync is sync 1
+  def int_ppc_lwsync : Intrinsic<[], [], []>;
 
   // Intrinsics used to generate ctr-based loops. These should only be
   // generated by the PowerPC backend!
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index d96cdab604f..e216a72c495 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -6538,6 +6538,38 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
 //  Other Lowering Code
 //===----------------------------------------------------------------------===//
 
+static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *Func = Intrinsic::getDeclaration(M, Id);
+  return Builder.CreateCall(Func);
+}
+
+// The mappings for emitLeading/TrailingFence are taken from
+// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+                                                 AtomicOrdering Ord, bool IsStore,
+                                                 bool IsLoad) const {
+  if (Ord == SequentiallyConsistent)
+    return callIntrinsic(Builder, Intrinsic::ppc_sync);
+  else if (isAtLeastRelease(Ord))
+    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+  else
+    return nullptr;
+}
+
+Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+                                                  AtomicOrdering Ord, bool IsStore,
+                                                  bool IsLoad) const {
+  if (IsLoad && isAtLeastAcquire(Ord))
+    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+  // FIXME: this is too conservative, a dependent branch + isync is enough.
+  // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
+  // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+  // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+  else
+    return nullptr;
+}
+
 MachineBasicBlock *
 PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                     bool is64bit, unsigned BinOpcode) const {
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 5628bc79342..7444d41bbc4 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -413,6 +413,11 @@ namespace llvm {
                                        const SelectionDAG &DAG,
                                        unsigned Depth = 0) const override;
 
+    Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+                                  bool IsStore, bool IsLoad) const override;
+    Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+                                   bool IsStore, bool IsLoad) const override;
+
     MachineBasicBlock *
     EmitInstrWithCustomInserter(MachineInstr *MI,
                                 MachineBasicBlock *MBB) const override;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 38020efb58b..5643ed78a70 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1704,8 +1704,13 @@ let isCodeGenOnly = 1 in {
   }
 }
 
-def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[IsNotBookE]>;
-def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[IsBookE]>;
+// FIXME: We should use a specific flag to check for the presence of the sync
+// instruction instead of relying on IsBookE.
+// For example, the PPC A2 supports SYNC despite being a BookE processor.
+def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[IsNotBookE]>;
+def : Pat<(int_ppc_lwsync), (SYNC 1)>, Requires<[IsNotBookE]>;
+def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[IsBookE]>;
+def : Pat<(int_ppc_lwsync), (MSYNC)>, Requires<[IsBookE]>;
 
 //===----------------------------------------------------------------------===//
 // PPC32 Arithmetic Instructions.
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 878d1364609..ed90e27237c 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -86,6 +86,7 @@ public:
     return *getPPCTargetMachine().getSubtargetImpl();
   }
 
+  void addIRPasses() override;
   bool addPreISel() override;
   bool addILPOpts() override;
   bool addInstSelector() override;
@@ -99,6 +100,11 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new PPCPassConfig(this, PM);
 }
 
+void PPCPassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass(&getPPCTargetMachine()));
+  TargetPassConfig::addIRPasses();
+}
+
 bool PPCPassConfig::addPreISel() {
   if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
     addPass(createPPCCTRLoops(getPPCTargetMachine()));
diff --git a/test/CodeGen/PowerPC/atomics.ll b/test/CodeGen/PowerPC/atomics.ll
new file mode 100644
index 00000000000..4c4ccb4df83
--- /dev/null
+++ b/test/CodeGen/PowerPC/atomics.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32
+; FIXME: -verify-machineinstrs currently fails on ppc64 (mismatched register/instruction).
+; This is already checked for in Atomics-64.ll
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64
+
+; FIXME: we don't currently check for the operations themselves with CHECK-NEXT,
+; because they are implemented in a very messy way with lwarx/stwcx.
+; It should be fixed soon in another patch.
+
+; We first check loads, for all sizes from i8 to i64.
+; We also vary orderings to check for barriers.
+define i8 @load_i8_unordered(i8* %mem) {
+; CHECK-LABEL: load_i8_unordered
+; CHECK-NOT: sync
+  %val = load atomic i8* %mem unordered, align 1
+  ret i8 %val
+}
+define i16 @load_i16_monotonic(i16* %mem) {
+; CHECK-LABEL: load_i16_monotonic
+; CHECK-NOT: sync
+  %val = load atomic i16* %mem monotonic, align 2
+  ret i16 %val
+}
+define i32 @load_i32_acquire(i32* %mem) {
+; CHECK-LABEL: load_i32_acquire
+  %val = load atomic i32* %mem acquire, align 4
+; CHECK: sync 1
+  ret i32 %val
+}
+define i64 @load_i64_seq_cst(i64* %mem) {
+; CHECK-LABEL: load_i64_seq_cst
+; CHECK: sync 0
+  %val = load atomic i64* %mem seq_cst, align 8
+; CHECK: sync 1
+  ret i64 %val
+}
+
+; Stores
+define void @store_i8_unordered(i8* %mem) {
+; CHECK-LABEL: store_i8_unordered
+; CHECK-NOT: sync
+  store atomic i8 42, i8* %mem unordered, align 1
+  ret void
+}
+define void @store_i16_monotonic(i16* %mem) {
+; CHECK-LABEL: store_i16_monotonic
+; CHECK-NOT: sync
+  store atomic i16 42, i16* %mem monotonic, align 2
+  ret void
+}
+define void @store_i32_release(i32* %mem) {
+; CHECK-LABEL: store_i32_release
+; CHECK: sync 1
+  store atomic i32 42, i32* %mem release, align 4
+  ret void
+}
+define void @store_i64_seq_cst(i64* %mem) {
+; CHECK-LABEL: store_i64_seq_cst
+; CHECK: sync 0
+  store atomic i64 42, i64* %mem seq_cst, align 8
+  ret void
+}
+
+; Atomic CmpXchg
+define i8 @cas_strong_i8_sc_sc(i8* %mem) {
+; CHECK-LABEL: cas_strong_i8_sc_sc
+; CHECK: sync 0
+  %val = cmpxchg i8* %mem, i8 0, i8 1 seq_cst seq_cst
+; CHECK: sync 1
+  %loaded = extractvalue { i8, i1} %val, 0
+  ret i8 %loaded
+}
+define i16 @cas_weak_i16_acquire_acquire(i16* %mem) {
+; CHECK-LABEL: cas_weak_i16_acquire_acquire
+; CHECK-NOT: sync
+  %val = cmpxchg weak i16* %mem, i16 0, i16 1 acquire acquire
+; CHECK: sync 1
+  %loaded = extractvalue { i16, i1} %val, 0
+  ret i16 %loaded
+}
+define i32 @cas_strong_i32_acqrel_acquire(i32* %mem) {
+; CHECK-LABEL: cas_strong_i32_acqrel_acquire
+; CHECK: sync 1
+  %val = cmpxchg i32* %mem, i32 0, i32 1 acq_rel acquire
+; CHECK: sync 1
+  %loaded = extractvalue { i32, i1} %val, 0
+  ret i32 %loaded
+}
+define i64 @cas_weak_i64_release_monotonic(i64* %mem) {
+; CHECK-LABEL: cas_weak_i64_release_monotonic
+; CHECK: sync 1
+  %val = cmpxchg weak i64* %mem, i64 0, i64 1 release monotonic
+; CHECK-NOT: [sync ]
+  %loaded = extractvalue { i64, i1} %val, 0
+  ret i64 %loaded
+}
+
+; AtomicRMW
+define i8 @add_i8_monotonic(i8* %mem, i8 %operand) {
+; CHECK-LABEL: add_i8_monotonic
+; CHECK-NOT: sync
+  %val = atomicrmw add i8* %mem, i8 %operand monotonic
+  ret i8 %val
+}
+define i16 @xor_i16_seq_cst(i16* %mem, i16 %operand) {
+; CHECK-LABEL: xor_i16_seq_cst
+; CHECK: sync 0
+  %val = atomicrmw xor i16* %mem, i16 %operand seq_cst
+; CHECK: sync 1
+  ret i16 %val
+}
+define i32 @xchg_i32_acq_rel(i32* %mem, i32 %operand) {
+; CHECK-LABEL: xchg_i32_acq_rel
+; CHECK: sync 1
+  %val = atomicrmw xchg i32* %mem, i32 %operand acq_rel
+; CHECK: sync 1
+  ret i32 %val
+}
+define i64 @and_i64_release(i64* %mem, i64 %operand) {
+; CHECK-LABEL: and_i64_release
+; CHECK: sync 1
+  %val = atomicrmw and i64* %mem, i64 %operand release
+; CHECK-NOT: [sync ]
+  ret i64 %val
+}
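
For reference, the sketch below (not part of the commit) shows roughly how the fence placement chosen by emitLeadingFence/emitTrailingFence lines up with C++11 atomics, following the cpp0xmappings table cited above and the CHECK lines in the new test. It assumes a non-BookE target where sync 0 is hwsync and sync 1 is lwsync; the load/store mnemonics in the comments are illustrative only, not the exact output of llc.

// Sketch only: approximate PowerPC sequences for C++11 atomics under the
// mapping used by this patch (non-BookE, so sync 0 = hwsync, sync 1 = lwsync).
#include <atomic>

std::atomic<int> x;

int load_acquire() { return x.load(std::memory_order_acquire); }
//   lwz ...     ; no leading fence for an acquire load
//   sync 1      ; trailing lwsync (emitTrailingFence, IsLoad && acquire)

void store_release(int v) { x.store(v, std::memory_order_release); }
//   sync 1      ; leading lwsync (emitLeadingFence, release or stronger)
//   stw ...     ; no trailing fence for a plain store

int load_seq_cst() { return x.load(std::memory_order_seq_cst); }
//   sync 0      ; leading hwsync for seq_cst
//   lwz ...
//   sync 1      ; trailing lwsync (conservative; see the FIXME about using a
//               ; dependent branch + isync instead)

void store_seq_cst(int v) { x.store(v, std::memory_order_seq_cst); }
//   sync 0      ; leading hwsync; emitTrailingFence returns nullptr when
//   stw ...     ; IsLoad is false, so nothing follows the store

Read-modify-write operations behave as both a load and a store, which is why the seq_cst cmpxchg and atomicrmw cases in atomics.ll expect sync 0 before the operation and sync 1 after it.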