From b617c550dc99d74eb0dcde814ce9bfbcfc2bdda9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 25 Nov 2015 19:58:34 +0000 Subject: [PATCH] AMDGPU: Make v2i64/v2f64 legal types. They can be loaded and stored, so count them as legal. This is mostly to fix a number of common cases for load/store merging. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254086 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIISelLowering.cpp | 44 ++++- lib/Target/AMDGPU/SIInstructions.td | 15 ++ lib/Target/AMDGPU/SIRegisterInfo.td | 4 +- test/CodeGen/AMDGPU/ds_read2_superreg.ll | 6 +- test/CodeGen/AMDGPU/ds_write2.ll | 5 +- test/CodeGen/AMDGPU/extract-vector-elt-i64.ll | 24 +++ test/CodeGen/AMDGPU/fadd64.ll | 10 +- test/CodeGen/AMDGPU/global-extload-i32.ll | 182 ++++++------------ test/CodeGen/AMDGPU/half.ll | 26 ++- test/CodeGen/AMDGPU/insert_vector_elt.ll | 103 +++++++--- test/CodeGen/AMDGPU/merge-stores.ll | 13 +- test/CodeGen/AMDGPU/reorder-stores.ll | 12 +- 12 files changed, 261 insertions(+), 183 deletions(-) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 4e2538eef31..4ed9cf6c97e 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -52,6 +52,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); @@ -156,13 +159,30 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); @@ -174,9 +194,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + + + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -187,6 +212,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, case ISD::INSERT_VECTOR_ELT: case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: + case ISD::SCALAR_TO_VECTOR: break; case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); @@ -198,6 +224,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, } } + // Most operations are naturally 32-bit vector operations. We only support + // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. + for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); + } + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index f2055549dd4..98d74a217ac 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -2501,6 +2501,11 @@ def : Pat < /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ +//def : Extract_Element; +//def : Extract_Element; +//def : Extract_Element; +//def : Extract_Element; + foreach Index = 0-2 in { def Extract_Element_v2i32_#Index : Extract_Element < i32, v2i32, Index, !cast(sub#Index) @@ -2586,6 +2591,16 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; + + + + def : BitConvert ; def : BitConvert ; def : BitConvert ; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index e28dd2fdf91..227c4f535cb 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -193,7 +193,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, (add SGPR_64, VCC, EXEC, FLAT_SCR) >; -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 32, (add SGPR_128)> { +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> { // Requires 2 s_mov_b64 to copy let CopyCost = 2; } @@ -221,7 +221,7 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { let CopyCost = 3; } -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 32, (add VGPR_128)> { +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { // Requires 4 v_mov_b32 to copy let CopyCost = 4; } diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll index 8d50960e4ab..8073426e9d8 100644 --- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -61,15 +61,11 @@ define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { ret void } - -; FIXME: the v_lshl_b64 x, x, 32 is a 
bad way of doing a copy - ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}} -; CI: v_lshr_b64 v{{\[}}[[Y_COPY:[0-9]+]]:{{[0-9]+\]}}, v{{\[}}[[REG_X]]:[[REG_Y]]{{\]}}, 32 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]] -; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[Y_COPY]], v[[ADD0]] +; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]] ; CI: buffer_store_dword v[[ADD1]] ; CI: s_endpgm define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll index d4973e377b5..b408459e82c 100644 --- a/test/CodeGen/AMDGPU/ds_write2.ll +++ b/test/CodeGen/AMDGPU/ds_write2.ll @@ -345,8 +345,9 @@ define void @store_constant_disjoint_offsets() { ; SI-LABEL: @store_misaligned64_constant_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +; SI: s_endpgm define void @store_misaligned64_constant_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 diff --git a/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll b/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll index 24301cc0cb9..e3255913962 100644 --- a/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll +++ b/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll @@ -17,3 +17,27 @@ define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspa store volatile i64 %val, i64 addrspace(1)* %in ret void } + + +define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) nounwind { + %p0 = extractelement <2 x i64> %foo, i32 0 + %p1 = extractelement <2 x i64> %foo, i32 1 + %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1 + store volatile i64 %p1, i64 addrspace(1)* %out + store volatile i64 %p0, i64 addrspace(1)* %out1 + ret void +} + +define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) nounwind { + %dynelt = extractelement <2 x i64> %foo, i32 %elt + store volatile i64 %dynelt, i64 addrspace(1)* %out + ret void +} + +define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) nounwind { + %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo + %or = or <2 x i64> %load, %arst + %dynelt = extractelement <2 x i64> %or, i32 %elt + store volatile i64 %dynelt, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll index 093f4fc5565..19c17289da3 100644 --- a/test/CodeGen/AMDGPU/fadd64.ll +++ b/test/CodeGen/AMDGPU/fadd64.ll @@ -23,8 +23,7 @@ define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { ; CHECK-LABEL: {{^}}v_fadd_v2f64: ; CHECK: v_add_f64 ; CHECK: v_add_f64 -; CHECK: buffer_store_dwordx2 -; CHECK: buffer_store_dwordx2 +; CHECK: buffer_store_dwordx4 define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) { %r0 = load <2 
x double>, <2 x double> addrspace(1)* %in1 @@ -35,10 +34,9 @@ define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspac } ; CHECK-LABEL: {{^}}s_fadd_v2f64: -; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -; CHECK: buffer_store_dwordx2 -; CHECK: buffer_store_dwordx2 +; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: buffer_store_dwordx4 define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) { %r2 = fadd <2 x double> %r0, %r1 store <2 x double> %r2, <2 x double> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/global-extload-i32.ll b/test/CodeGen/AMDGPU/global-extload-i32.ll index ef2f64d673d..e5e6be2199c 100644 --- a/test/CodeGen/AMDGPU/global-extload-i32.ll +++ b/test/CodeGen/AMDGPU/global-extload-i32.ll @@ -49,8 +49,7 @@ define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64: ; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i32>, <2 x i32> addrspace(1)* %in @@ -63,8 +62,7 @@ define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; SI: buffer_load_dwordx2 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i32>, <2 x i32> addrspace(1)* %in @@ -75,10 +73,8 @@ define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64: ; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -93,10 +89,8 @@ define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -108,14 +102,10 @@ define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64: ; SI: buffer_load_dwordx4 ; SI: buffer_load_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; 
SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i32>, <8 x i32> addrspace(1)* %in @@ -136,15 +126,10 @@ define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i32>, <8 x i32> addrspace(1)* %in @@ -163,29 +148,25 @@ define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i32>, <16 x i32> addrspace(1)* %in @@ -200,23 +181,14 @@ define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; SI: buffer_load_dwordx4 ; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 - +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i32>, <16 x i32> addrspace(1)* %in @@ -269,41 +241,25 @@ define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; 
SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { @@ -323,41 +279,25 @@ define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; SI: buffer_load_dwordx4 ; SI: buffer_load_dwordx4 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 + +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 + +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 + +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 +; SI-DAG: buffer_store_dwordx4 ; SI: s_endpgm define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll index a344d213d1f..a02cbf43c40 100644 --- a/test/CodeGen/AMDGPU/half.ll +++ b/test/CodeGen/AMDGPU/half.ll @@ -382,10 +382,9 @@ define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace ; GCN-DAG: buffer_load_ushort 
[[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] ; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] -; GCN-DAG: v_cvt_f64_f32_e32 [[CVT2:v\[[0-9]+:[0-9]+\]]], v[[CVT0]] -; GCN-DAG: v_cvt_f64_f32_e32 [[CVT3:v\[[0-9]+:[0-9]+\]]], v[[CVT1]] -; GCN-DAG: buffer_store_dwordx2 [[CVT2]] -; GCN-DAG: buffer_store_dwordx2 [[CVT3]] +; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] +; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} ; GCN: s_endpgm define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in @@ -395,6 +394,25 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x } ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: + +; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]], 32 +; VI: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 32, [[LOAD]] +; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} + +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN-NOT: v_cvt_f32_f16_e32 + +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN-NOT: v_cvt_f64_f32_e32 + +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 +; GCN: s_endpgm define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in %cvt = fpext <3 x half> %val to <3 x double> diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll index 6de3d408c48..7f9579e5978 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -70,8 +70,9 @@ define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x fl } ; SI-LABEL: {{^}}dynamic_insertelement_v8f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 +; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 @@ -79,10 +80,11 @@ define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x fl } ; SI-LABEL: {{^}}dynamic_insertelement_v16f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 +; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 @@ -202,10 +204,28 @@ endif: } ; SI-LABEL: {{^}}dynamic_insertelement_v2f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: s_load_dword [[IDX:s[0-9]+]], 
s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}} +; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}} +; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}} + +; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} + +; SI: s_mov_b32 m0, [[SCALEDIDX]] +; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]] + +; Increment to next element. +; FIXME: Should be able to manipulate m0 directly instead of add and +; copy. + +; SI: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1 +; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000 +; SI-DAG: s_mov_b32 m0, [[IDX1]] +; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]] + +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { %vecins = insertelement <2 x double> %a, double 8.0, i32 %b @@ -213,9 +233,16 @@ define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x d ret void } +; FIXME: Inline immediate should be folded into v_movreld_b32. ; SI-LABEL: {{^}}dynamic_insertelement_v2i64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 + +; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}} +; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}} + +; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]] +; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]] + +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { %vecins = insertelement <2 x i64> %a, i64 5, i32 %b @@ -223,12 +250,29 @@ define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> ret void } +; FIXME: Should be able to do without stack access. The used stack +; space is also 2x what should be required. 
+ ; SI-LABEL: {{^}}dynamic_insertelement_v4f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: SCRATCH_RSRC_DWORD + +; Stack store +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} + +; Write element +; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; Stack reload +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; Store result +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm +; SI: ScratchSize: 64 + define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { %vecins = insertelement <4 x double> %a, double 8.0, i32 %b store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 @@ -236,15 +280,26 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d } ; SI-LABEL: {{^}}dynamic_insertelement_v8f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: SCRATCH_RSRC_DWORD + +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}} +; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}} + +; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} + +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm +; SI: ScratchSize: 128 define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { %vecins = insertelement <8 x double> %a, double 8.0, i32 %b store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll index fec27e7168c..12d11ccfe41 100644 --- a/test/CodeGen/AMDGPU/merge-stores.ll +++ b/test/CodeGen/AMDGPU/merge-stores.ll @@ -191,9 +191,7 @@ define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx4 
define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 @@ -203,13 +201,8 @@ define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { } ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: -; XGCN: buffer_store_dwordx4 -; XGCN: buffer_store_dwordx4 - -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll index 712205c2ce2..d5e10d0be88 100644 --- a/test/CodeGen/AMDGPU/reorder-stores.ll +++ b/test/CodeGen/AMDGPU/reorder-stores.ll @@ -2,14 +2,10 @@ ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 -- 2.34.1
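
For background on the mechanism the SIISelLowering.cpp hunk relies on: a type only becomes legal once it is assigned a register class, and operations on it that have no native instruction are marked Promote and redirected to another type with AddPromotedToType, so the existing v4i32 load/store patterns (and the store merger's dwordx4 output) apply unchanged. Below is a condensed, illustrative C++ sketch of that pattern, not part of the patch; the class name MyTargetLowering, the helper name, and the register class MyTarget::GPR128RegClass are placeholders, and the fragment is not buildable on its own.

// Sketch of the Promote + AddPromotedToType pattern used above
// (placeholder names; calls mirror the ones added in SIISelLowering.cpp).
#include "llvm/Target/TargetLowering.h"  // header location in LLVM trees of this era

using namespace llvm;

void MyTargetLowering::makeV2i64Legal() {
  // A type is legal only once it has a register class. In this patch,
  // v2i64/v2f64 are mapped onto the existing 128-bit classes.
  addRegisterClass(MVT::v2i64, &MyTarget::GPR128RegClass);

  // Loads and stores of v2i64 are rewritten as the equivalent v4i32
  // operations, so the existing 128-bit load/store selection patterns match.
  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  // Vector manipulation ops without a native 64-bit form are likewise
  // promoted to their v4i32 equivalents rather than expanded.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Promote);
  AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2i64, MVT::v4i32);
}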