From: Tom Stellard Date: Fri, 16 Aug 2013 01:12:06 +0000 (+0000) Subject: R600: Add support for i16 and i8 global stores X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=ec484277dd04399d7b2ea37508e39fc4998bc9a7;p=oota-llvm.git R600: Add support for i16 and i8 global stores Tested-by: Aaron Watry git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188519 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 1e799988987..7ceab2df0ef 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -558,5 +558,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SAMPLEB) NODE_NAME_CASE(SAMPLED) NODE_NAME_CASE(SAMPLEL) + NODE_NAME_CASE(STORE_MSKOR) } } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 9adbb543d34..8788c20903d 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -150,6 +150,7 @@ enum { SAMPLED, SAMPLEL, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, + STORE_MSKOR, LOAD_CONSTANT, LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 48d89dd5819..c61993a8cda 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -72,3 +72,7 @@ def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, [SDNPHasChain, SDNPMayStore]>; + +def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", + SDTypeProfile<0, 2, []>, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index ddb655add83..df0bade2790 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ 
-146,6 +146,16 @@ def az_extloadi32_constant : PatFrag<(ops node:$ptr), return isConstantLoad(dyn_cast(N), -1); }]>; +def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast(N)); +}]>; + +def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast(N)); +}]>; + def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isLocalLoad(dyn_cast(N)); }]>; @@ -155,6 +165,11 @@ def local_store : PatFrag<(ops node:$val, node:$ptr), return isLocalStore(dyn_cast(N)); }]>; +def mskor_global : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return dyn_cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +}]>; + class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index a89875c99a7..b6b65609c1e 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -84,6 +84,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setTruncStoreAction(MVT::i32, MVT::i8, Custom); + setTruncStoreAction(MVT::i32, MVT::i16, Custom); setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); @@ -1009,19 +1011,54 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue Value = Op.getOperand(1); SDValue Ptr = Op.getOperand(2); - if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && - Ptr->getOpcode() != AMDGPUISD::DWORDADDR) { - // Convert pointer from byte address to dword address. 
- Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), - DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), - Ptr, DAG.getConstant(2, MVT::i32))); - - if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { - assert(!"Truncated and indexed stores not supported yet"); - } else { - Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); + if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { + if (StoreNode->isTruncatingStore()) { + EVT VT = Value.getValueType(); + assert(VT == MVT::i32); + EVT MemVT = StoreNode->getMemoryVT(); + SDValue MaskConstant; + if (MemVT == MVT::i8) { + MaskConstant = DAG.getConstant(0xFF, MVT::i32); + } else { + assert(MemVT == MVT::i16); + MaskConstant = DAG.getConstant(0xFFFF, MVT::i32); + } + SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, + DAG.getConstant(2, MVT::i32)); + SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(0x00000003, VT)); + SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); + SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, + DAG.getConstant(3, VT)); + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); + SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); + // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 + // vector instead. + SDValue Src[4] = { + ShiftedValue, + DAG.getConstant(0, MVT::i32), + DAG.getConstant(0, MVT::i32), + Mask + }; + SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4); + SDValue Args[3] = { Chain, Input, DWordAddr }; + return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, + Op->getVTList(), Args, 3, MemVT, + StoreNode->getMemOperand()); + } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && + Value.getValueType().bitsGE(MVT::i32)) { + // Convert pointer from byte address to dword address. 
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), + DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), + Ptr, DAG.getConstant(2, MVT::i32))); + + if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { + assert(!"Truncated and indexed stores not supported yet"); + } else { + Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); + } + return Chain; } - return Chain; } EVT ValueVT = Value.getValueType(); diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 06886ce0560..b059a8179b7 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1274,6 +1274,19 @@ class CF_MEM_RAT_CACHELESS rat_inst, bits<4> rat_id, bits<4> mask, dag : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins, "MEM_RAT_CACHELESS "#name, pattern>; +class CF_MEM_RAT rat_inst, bits<4> rat_id, dag ins, string name, + list pattern> + : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, + "MEM_RAT "#name, pattern>; + +def RAT_MSKOR : CF_MEM_RAT <0x11, 0, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), + "MSKOR $rw_gpr.XW, $index_gpr", + [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)] +> { + let eop = 0; +} + } // End Predicates = [isEGorCayman] diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 26394562231..ecc471817e4 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -400,9 +400,9 @@ multiclass MUBUF_Load_Helper op, string asm, RegisterClass regClass> { } } -class MUBUF_Store_Helper op, string name, RegisterClass vdataClass, - ValueType VT> : - MUBUF op, string name, RegisterClass vdataClass> : + MUBUF { diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index e719cb32706..4eb3566c011 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -409,19 +409,25 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <0x0000000b, "BUFFER_LOAD_SSHORT", V 
defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>; -//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>; -//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>; + +def BUFFER_STORE_BYTE : MUBUF_Store_Helper < + 0x00000018, "BUFFER_STORE_BYTE", VReg_32 +>; + +def BUFFER_STORE_SHORT : MUBUF_Store_Helper < + 0x0000001a, "BUFFER_STORE_SHORT", VReg_32 +>; def BUFFER_STORE_DWORD : MUBUF_Store_Helper < - 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32 + 0x0000001c, "BUFFER_STORE_DWORD", VReg_32 >; def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < - 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, i64 + 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64 >; def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < - 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32 + 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128 >; //def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; //def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; @@ -1826,23 +1832,25 @@ defm : MUBUFLoad_Pattern ; -multiclass MUBUFStore_Pattern { +multiclass MUBUFStore_Pattern { def : Pat < - (global_store vt:$value, i64:$ptr), + (st vt:$value, i64:$ptr), (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0) >; def : Pat < - (global_store vt:$value, (add i64:$ptr, i64:$offset)), + (st vt:$value, (add i64:$ptr, i64:$offset)), (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0) >; } -defm : MUBUFStore_Pattern ; -defm : MUBUFStore_Pattern ; -defm : MUBUFStore_Pattern ; -defm : MUBUFStore_Pattern ; +defm : MUBUFStore_Pattern ; +defm : MUBUFStore_Pattern ; +defm : MUBUFStore_Pattern ; +defm : MUBUFStore_Pattern ; +defm : MUBUFStore_Pattern ; +defm : MUBUFStore_Pattern ; /********** ====================== **********/ /********** Indirect adressing 
**********/ diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll index 0b0e210d5a6..fe53d601d39 100644 --- a/test/CodeGen/R600/si-vector-hang.ll +++ b/test/CodeGen/R600/si-vector-hang.ll @@ -1,7 +1,5 @@ ; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s -; XXX: Mark this test as XFAIL until buffer stores are implemented -; XFAIL: * ; CHECK: @test_8_min_char ; CHECK: BUFFER_STORE_BYTE ; CHECK: BUFFER_STORE_BYTE diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll index f2a8dd7d265..cba01a31c52 100644 --- a/test/CodeGen/R600/store.ll +++ b/test/CodeGen/R600/store.ll @@ -2,6 +2,67 @@ ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s ; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s +;===------------------------------------------------------------------------===; +; Global Address Space +;===------------------------------------------------------------------------===; + +; i8 store +; EG-CHECK: @store_i8 +; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X +; EG-CHECK: VTX_READ_8 [[VAL:T[0-9]\.X]], [[VAL]] +; IG 0: Get the byte index +; EG-CHECK: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG-CHECK-NEXT: 3 +; IG 1: Truncate the value and calculate the shift amount for the mask +; EG-CHECK: AND_INT T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.x +; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.y +; EG-CHECK: 255(3.573311e-43), 3 +; IG 2: Shift the value and the mask +; EG-CHECK: LSHL T[[RW_GPR]].X, PV.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]] +; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] +; EG-CHECK-NEXT: 255 +; IG 3: Initialize the Y and Z channels to zero +; XXX: An optimal scheduler should merge this into one of the previous IGs.
+; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0 +; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0 + +; SI-CHECK: @store_i8 +; SI-CHECK: BUFFER_STORE_BYTE + +define void @store_i8(i8 addrspace(1)* %out, i8 %in) { +entry: + store i8 %in, i8 addrspace(1)* %out + ret void +} + +; i16 store +; EG-CHECK: @store_i16 +; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X +; EG-CHECK: VTX_READ_16 [[VAL:T[0-9]\.X]], [[VAL]] +; IG 0: Get the byte index +; EG-CHECK: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG-CHECK-NEXT: 3 +; IG 1: Truncate the value and calculate the shift amount for the mask +; EG-CHECK: AND_INT T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.x +; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.y +; EG-CHECK: 65535(9.183409e-41), 3 +; IG 2: Shift the value and the mask +; EG-CHECK: LSHL T[[RW_GPR]].X, PV.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]] +; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] +; EG-CHECK-NEXT: 65535 +; IG 3: Initialize the Y and Z channels to zero +; XXX: An optimal scheduler should merge this into one of the previous IGs. +; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0 +; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0 + +; SI-CHECK: @store_i16 +; SI-CHECK: BUFFER_STORE_SHORT +define void @store_i16(i16 addrspace(1)* %out, i16 %in) { +entry: + store i16 %in, i16 addrspace(1)* %out + ret void +} + ; floating-point store ; EG-CHECK: @store_f32 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1