From 6a0d02e088584bb29783924347a5de31aeb06f3d Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 15 Dec 2015 20:55:55 +0000 Subject: [PATCH] AMDGPU/SI: Select constant loads with non-uniform addresses to MUBUF instructions Summary: We were previously selecting all constant loads to SMRD instructions and legalizing the SMRDs with non-uniform addresses during the SIFixSGPRCopies pass. This new solution is simpler and also generates much better code, because the instruction selector is able to take advantage of all the MUBUF addressing modes that our legalization pass wasn't able to use. We also no longer need to generate v_add_* instructions when we have a uniform pointer and a non-uniform offset, as this is now folded into the MUBUF instruction during instruction selection. Reviewers: arsenm Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D15425 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255672 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPU.h | 3 + .../AMDGPU/AMDGPUAnnotateUniformValues.cpp | 84 +++++++++++++++++++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 + lib/Target/AMDGPU/CMakeLists.txt | 1 + lib/Target/AMDGPU/SIISelLowering.cpp | 23 +++++ lib/Target/AMDGPU/SIISelLowering.h | 1 + lib/Target/AMDGPU/SIInstrInfo.td | 10 +++ lib/Target/AMDGPU/SIInstructions.td | 19 +++-- test/CodeGen/AMDGPU/salu-to-valu.ll | 77 +++++++++-------- 9 files changed, 178 insertions(+), 43 deletions(-) create mode 100644 lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index fc0530dc4f2..8c3cb567fc7 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -71,6 +71,7 @@ Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); ModulePass *createAMDGPUAlwaysInlinePass(); ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); +FunctionPass *createAMDGPUAnnotateUniformValues(); void 
initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); extern char &SIFixControlFlowLiveIntervalsID; @@ -78,6 +79,8 @@ extern char &SIFixControlFlowLiveIntervalsID; void initializeSIFixSGPRLiveRangesPass(PassRegistry&); extern char &SIFixSGPRLiveRangesID; +void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&); +extern char &AMDGPUAnnotateUniformValuesPassID; extern Target TheAMDGPUTarget; extern Target TheGCNTarget; diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp new file mode 100644 index 00000000000..dfddc345f28 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -0,0 +1,84 @@ +//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass adds amdgpu.uniform metadata to IR values so this information +/// can be used during instruction selection. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-annotate-uniform" + +using namespace llvm; + +namespace { + +class AMDGPUAnnotateUniformValues : public FunctionPass, + public InstVisitor { + DivergenceAnalysis *DA; + +public: + static char ID; + AMDGPUAnnotateUniformValues() : + FunctionPass(ID) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesAll(); + } + + void visitLoadInst(LoadInst &I); + +}; + +} // End anonymous namespace + +INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, + "Add AMDGPU uniform metadata", false, false) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, + "Add AMDGPU uniform metadata", false, false) + +char AMDGPUAnnotateUniformValues::ID = 0; + +void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { + Value *Ptr = I.getPointerOperand(); + if (!DA->isUniform(Ptr)) + return; + + if (Instruction *PtrI = dyn_cast(Ptr)) + PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {})); + +} + +bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { + return false; +} + +bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { + DA = &getAnalysis(); + visit(F); + + return true; +} + +FunctionPass * +llvm::createAMDGPUAnnotateUniformValues() { + return new AMDGPUAnnotateUniformValues(); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index a22933ecf8a..22f85b3e663 
100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -51,6 +51,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); + initializeAMDGPUAnnotateUniformValuesPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -279,6 +280,8 @@ bool GCNPassConfig::addPreISel() { addPass(createSinkingPass()); addPass(createSITypeRewriter()); addPass(createSIAnnotateControlFlowPass()); + addPass(createAMDGPUAnnotateUniformValues()); + return false; } diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 3a51a5f5e10..30bb0e0adde 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_target(AMDGPUCodeGen AMDILCFGStructurizer.cpp AMDGPUAlwaysInlinePass.cpp AMDGPUAnnotateKernelFeatures.cpp + AMDGPUAnnotateUniformValues.cpp AMDGPUAsmPrinter.cpp AMDGPUDiagnosticInfoUnsupported.cpp AMDGPUFrameLowering.cpp diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 71864de6957..0e043cb47da 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -504,6 +504,21 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); } + +bool SITargetLowering::isMemOpUniform(const SDNode *N) const { + const MemSDNode *MemNode = cast(N); + const Value *Ptr = MemNode->getMemOperand()->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. 
+ // Sometimes LDS instructions have constant pointers + if (isa(Ptr) || isa(Ptr) || isa(Ptr) || + isa(Ptr)) + return true; + + const Instruction *I = dyn_cast_or_null(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -1328,6 +1343,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { switch (Load->getAddressSpace()) { default: break; + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + break; + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requirements as global and private + // loads. + // + // Fall-through case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::PRIVATE_ADDRESS: if (NumElements >= 8) diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 4079be65d28..e2f8cb19d6b 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -80,6 +80,7 @@ public: bool MemcpyStrSrc, MachineFunction &MF) const override; + bool isMemOpUniform(const SDNode *N) const; bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; TargetLoweringBase::LegalizeTypeAction diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index c57d0c07aab..e722e0851a5 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -137,6 +137,16 @@ def SIconstdata_ptr : SDNode< SDTCisVT<0, i64>]> >; +def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(cast(N)) || + isConstantLoad(cast(N), -1); +}]>; + +def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(cast(N), -1) && + static_cast(getTargetLowering())->isMemOpUniform(N); +}]>; + //===----------------------------------------------------------------------===// // SDNodes and 
PatFrag for local loads and stores to enable s_mov_b32 m0, -1 // to be glued to the memory instructions. diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 8163419cd48..6f653c70aca 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -953,13 +953,13 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load >; defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < @@ -2087,24 +2087,29 @@ multiclass SMRD_Pattern { // 1. IMM offset def : Pat < - (constant_load (SMRDImm i64:$sbase, i32:$offset)), + (smrd_load (SMRDImm i64:$sbase, i32:$offset)), (vt (!cast(Instr#"_IMM") $sbase, $offset)) >; // 2. SGPR offset def : Pat < - (constant_load (SMRDSgpr i64:$sbase, i32:$offset)), + (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), (vt (!cast(Instr#"_SGPR") $sbase, $offset)) >; def : Pat < - (constant_load (SMRDImm32 i64:$sbase, i32:$offset)), + (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), (vt (!cast(Instr#"_IMM_ci") $sbase, $offset)) > { let Predicates = [isCIOnly]; } } +// Global and constant loads can be selected to either MUBUF or SMRD +// instructions, but SMRD instructions are faster so we want the instruction +// selector to prefer those. 
+let AddedComplexity = 100 in { + defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; @@ -2133,6 +2138,8 @@ def : Pat < } // End Predicates = [isCI] +} // End let AddedComplexity = 100 + //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll index e40732ba3af..a30c25e700a 100644 --- a/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -77,7 +77,8 @@ endif: ; preds = %else, %if ; Test moving an SMRD with an immediate offset to the VALU ; GCN-LABEL: {{^}}smrd_valu2: -; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}} define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 @@ -90,8 +91,10 @@ entry: ; Use a big offset that will use the SMRD literal offset on CI ; GCN-LABEL: {{^}}smrd_valu_ci_offset: -; GCN: s_movk_i32 s[[OFFSET:[0-9]+]], 0x4e20{{$}} -; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET]]:{{[0-9]+}}], 0 addr64{{$}} +; GCN-NOT: v_add +; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} ; GCN: v_add_i32_e32 ; GCN: buffer_store_dword define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { @@ -106,8 +109,10 @@ entry: } ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2: -; GCN: s_mov_b32 s[[OFFSET:[0-9]+]], 0x9c40{{$}} -; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 
s{{\[}}[[OFFSET]]:{{[0-9]+}}], 0 addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: buffer_store_dwordx2 @@ -123,8 +128,10 @@ entry: } ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4: -; GCN: s_movk_i32 s[[OFFSET:[0-9]+]], 0x4d20{{$}} -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET]]:{{[0-9]+}}], 0 addr64{{$}} +; GCN-NOT: v_add +; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} @@ -145,14 +152,14 @@ entry: ; CI. 
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8: -; GCN: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x9a40{{$}} -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET0]]:{{[0-9]+}}], 0 addr64{{$}} - -; SI: s_add_i32 s[[OFFSET1:[0-9]+]], s[[OFFSET0]], 16 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET1]]:{{[0-9]+}}], 0 addr64{{$}} - -; CI: s_mov_b32 s[[OFFSET1:[0-9]+]], 0x9a50{{$}} -; CI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET1]]:{{[0-9]+}}], 0 addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} @@ -175,22 +182,24 @@ entry: ret void } -; FIXME: should use immediate offset instead of using s_add_i32 for adding to constant. 
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: -; GCN-DAG: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}} -; SI-DAG: s_add_i32 s[[OFFSET1:[0-9]+]], s[[OFFSET0]], 16 -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET0]]:{{[0-9]+}}], 0 addr64{{$}} - -; CI-DAG: s_mov_b32 s[[OFFSET1:[0-9]+]], 0x13490{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET1]]:{{[0-9]+}}], 0 addr64{{$}} - -; SI-DAG: s_add_i32 s[[OFFSET2:[0-9]+]], s[[OFFSET0]], 32 -; CI-DAG: s_mov_b32 s[[OFFSET2:[0-9]+]], 0x134a0 - -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET2]]:{{[0-9]+}}], 0 addr64{{$}} -; GCN-DAG: s_add_i32 s[[OFFSET3:[0-9]+]], s[[OFFSET2]], 16 -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET3]]:{{[0-9]+}}], 0 addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}} +; GCN-NOT: v_add +; GCN: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} @@ -244,15 +253,9 @@ entry: ret void } -; Offset is too big to fit in SMRD 8-bit offset, but small enough to -; fit in MUBUF offset. 
-; FIXME: We should be using the offset but we don't - ; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset: -; SI: s_movk_i32 s[[OFFSET:[0-9]+]], 0x400{{$}} -; SI: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET]]:{{[0-9]+\]}}, 0 addr64{{$}} - -; CI: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}} +; GCN-NOT: v_add +; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}} define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 -- 2.34.1