R600/SI: 64-bit and larger memory access must be at least 4-byte aligned

author Tom Stellard <thomas.stellard@amd.com>

Mon, 2 Feb 2015 18:02:28 +0000 (18:02 +0000)

committer Tom Stellard <thomas.stellard@amd.com>

Mon, 2 Feb 2015 18:02:28 +0000 (18:02 +0000)
author Tom Stellard <thomas.stellard@amd.com>
Mon, 2 Feb 2015 18:02:28 +0000 (18:02 +0000)
committer Tom Stellard <thomas.stellard@amd.com>
Mon, 2 Feb 2015 18:02:28 +0000 (18:02 +0000)
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp

index ef9f8f6ee769e530d05d4d695014f9b8b2c5e03d..6ff17ec1f067bb009c82a36e8e9397c7500b0fb4 100644 (file)
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -315,9 +315,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
    if (!VT.isSimple() || VT == MVT::Other)
      return false;
  
-  // XXX - CI changes say "Support for unaligned memory accesses" but I don't
-  // see what for specifically. The wording everywhere else seems to be the
-  // same.
+  // TODO - CI+ supports unaligned memory accesses, but this requires driver
+  // support.
  
    // XXX - The only mention I see of this in the ISA manual is for LDS direct
    // reads the "byte address and must be dword aligned". Is it also true for the
@@ -334,7 +333,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
    // This applies to private, global, and constant memory.
    if (IsFast)
      *IsFast = true;
-  return VT.bitsGT(MVT::i32);
+
+  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
  }
  
  EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll

index 710a40021575fd624a230f93261dcc14658f002c..4d4bf934d0d8fa454fce6b45fc935fd78f847180 100644 (file)
--- a/test/CodeGen/R600/cvt_f32_ubyte.ll
+++ b/test/CodeGen/R600/cvt_f32_ubyte.ll
@@ -146,7 +146,7 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
  ; SI: buffer_store_dword
  ; SI: buffer_store_dword
  define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <8 x i8> addrspace(1)* %in, align 1
+  %load = load <8 x i8> addrspace(1)* %in, align 8
    %cvt = uitofp <8 x i8> %load to <8 x float>
    store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
    ret void
diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll

index ed7cf520e201b52371dd5a4baed922ec4a808d4c..1187ff246f65261bc15c722db56212a0a10b92fb 100644 (file)
--- a/test/CodeGen/R600/unaligned-load-store.ll
+++ b/test/CodeGen/R600/unaligned-load-store.ll
@@ -1,18 +1,65 @@
  ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
  ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
  
-; SI-LABEL: {{^}}unaligned_load_store_i32:
+; SI-LABEL: {{^}}unaligned_load_store_i32_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
  ; SI: ds_read_u8
  ; SI: ds_read_u8
  ; SI: ds_write_b32
  ; SI: s_endpgm
-define void @unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
+define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
    %v = load i32 addrspace(3)* %p, align 1
    store i32 %v, i32 addrspace(3)* %r, align 1
    ret void
  }
  
-; SI-LABEL: {{^}}unaligned_load_store_v4i32:
+; SI-LABEL: {{^}}unaligned_load_store_i32_global:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind {
+  %v = load i32 addrspace(1)* %p, align 1
+  store i32 %v, i32 addrspace(1)* %r, align 1
+  ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i64_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_write2_b32
+; SI: s_endpgm
+define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
+  %v = load i64 addrspace(3)* %p, align 1
+  store i64 %v, i64 addrspace(3)* %r, align 1
+  ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i64_global:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_dwordx2
+define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
+  %v = load i64 addrspace(1)* %p, align 1
+  store i64 %v, i64 addrspace(1)* %r, align 1
+  ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_v4i32_local:
  ; SI: ds_read_u8
  ; SI: ds_read_u8
  ; SI: ds_read_u8
@@ -38,12 +85,36 @@ define void @unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r
  ; SI: ds_write_b32
  ; SI: ds_write_b32
  ; SI: s_endpgm
-define void @unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
+define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
    %v = load <4 x i32> addrspace(3)* %p, align 1
    store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
    ret void
  }
  
+; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded.
+; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
+  %v = load <4 x i32> addrspace(1)* %p, align 1
+  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
+  ret void
+}
+
  ; SI-LABEL: {{^}}load_lds_i64_align_4:
  ; SI: ds_read2_b32
  ; SI: s_endpgm
author	Tom Stellard <thomas.stellard@amd.com>
	Mon, 2 Feb 2015 18:02:28 +0000 (18:02 +0000)
committer	Tom Stellard <thomas.stellard@amd.com>
	Mon, 2 Feb 2015 18:02:28 +0000 (18:02 +0000)
lib/Target/R600/SIISelLowering.cpp		patch | blob | history
test/CodeGen/R600/cvt_f32_ubyte.ll		patch | blob | history
test/CodeGen/R600/unaligned-load-store.ll		patch | blob | history