From 8b3a9205b717fb77e7ffed6ed86db671228d9c4a Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Wed, 15 Oct 2014 18:06:43 +0000
Subject: [PATCH] R600/SI: Also try to use 0 base for misaligned 8-byte DS loads.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@219823 91177308-0d34-0410-b5e6-96231b3b80d8
---
Review notes (free-text area, not part of the commit):

* SelectDS64Bit4ByteAligned gains a path for an address that is itself a
  ConstantSDNode. The byte address is divided by 4 to obtain the dword index
  used as offset0, and offset1 is offset0 + 1. The assert documents that the
  address is expected to be a multiple of 4.
* Only when both dword offsets pass isUInt<8> is the base replaced by a
  V_MOV_B32_e32 of 0 and the address carried entirely by offset0/offset1.
  Otherwise control falls through to the existing default case (Base = Addr).
* The tests cover both outcomes: small constant addresses (@bar) must use the
  zero base with offset0:0/offset1:1 and offset0:2/offset1:3, while large
  ones (@bar.large, 0x4000 and 0x7ff8) must keep the address in the base
  register with offset0:0 offset1:1.

 lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 17 +++++++++++++++
 test/CodeGen/R600/ds_read2.ll          | 30 ++++++++++++++++++++++++++
 test/CodeGen/R600/ds_write2.ll         | 26 ++++++++++++++++++++++
 3 files changed, 73 insertions(+)

diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index a1bbfa65530..becb7112217 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -823,6 +823,23 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
     }
   }

+  if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+    unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
+    unsigned DWordOffset1 = DWordOffset0 + 1;
+    assert(4 * DWordOffset0 == CAddr->getZExtValue());
+
+    if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
+      SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
+      MachineSDNode *MovZero
+        = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+                                 SDLoc(Addr), MVT::i32, Zero);
+      Base = SDValue(MovZero, 0);
+      Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8);
+      return true;
+    }
+  }
+
   // default case
   Base = Addr;
   Offset0 = CurDAG->getTargetConstant(0, MVT::i8);
diff --git a/test/CodeGen/R600/ds_read2.ll b/test/CodeGen/R600/ds_read2.ll
index 74d3a598b96..388d21ba08c 100644
--- a/test/CodeGen/R600/ds_read2.ll
+++ b/test/CodeGen/R600/ds_read2.ll
@@ -406,6 +406,36 @@ define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
   ret void
 }

+@bar = addrspace(3) global [4 x i64] zeroinitializer, align 4
+
+; SI-LABEL: @load_misaligned64_constant_offsets
+; SI: V_MOV_B32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: DS_READ2_B32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:1
+; SI: DS_READ2_B32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
+define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
+  %val0 = load i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
+  %val1 = load i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+  %sum = add i64 %val0, %val1
+  store i64 %sum, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+@bar.large = addrspace(3) global [4096 x i64] zeroinitializer, align 4
+
+; SI-LABEL: @load_misaligned64_constant_large_offsets
+; SI-DAG: V_MOV_B32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
+; SI-DAG: V_MOV_B32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
+; SI-DAG: DS_READ2_B32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset0:0 offset1:1
+; SI-DAG: DS_READ2_B32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset0:0 offset1:1
+; SI: S_ENDPGM
+define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
+  %val0 = load i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
+  %val1 = load i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+  %sum = add i64 %val0, %val1
+  store i64 %sum, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] zeroinitializer, align 4
 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] zeroinitializer, align 4

diff --git a/test/CodeGen/R600/ds_write2.ll b/test/CodeGen/R600/ds_write2.ll
index 6e5bcffb621..99876f9ce07 100644
--- a/test/CodeGen/R600/ds_write2.ll
+++ b/test/CodeGen/R600/ds_write2.ll
@@ -341,6 +341,32 @@ define void @store_constant_disjoint_offsets() {
   ret void
 }

+@bar = addrspace(3) global [4 x i64] zeroinitializer, align 4
+
+; SI-LABEL: @store_misaligned64_constant_offsets
+; SI: V_MOV_B32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: DS_WRITE2_B32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: DS_WRITE2_B32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+define void @store_misaligned64_constant_offsets() {
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+  ret void
+}
+
+@bar.large = addrspace(3) global [4096 x i64] zeroinitializer, align 4
+
+; SI-LABEL: @store_misaligned64_constant_large_offsets
+; SI-DAG: V_MOV_B32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
+; SI-DAG: V_MOV_B32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
+; SI-DAG: DS_WRITE2_B32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI-DAG: DS_WRITE2_B32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: S_ENDPGM
+define void @store_misaligned64_constant_large_offsets() {
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+  ret void
+}
+
 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] zeroinitializer, align 4
 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] zeroinitializer, align 4

--
2.34.1