return true;
}
bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
unsigned Align,
bool *IsFast) const {
//===----------------------------------------------------------------------===//
SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
- DAGCombinerInfo &DCI) {
+ DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
EVT ScalarVT = VT.getScalarType();
if (ScalarVT != MVT::f32)
EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
-
LoadSDNode *Load = cast<LoadSDNode>(Src);
+
+ unsigned AS = Load->getAddressSpace();
+ unsigned Align = Load->getAlignment();
+ Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+
+ // Don't try to replace the load if we have to expand it due to alignment
+ // problems. Otherwise we will end up scalarizing the load, and trying to
+ // repack into the vector for no real reason.
+ if (Align < ABIAlignment &&
+ !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
+ return SDValue();
+ }
+
SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
Load->getChain(),
Load->getBasePtr(),
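
The guard added above compares the load's declared alignment against the ABI alignment that the DataLayout reports for the equivalent memory type, and only asks the target hook when the load is under-aligned. A minimal standalone sketch of that DataLayout query (not part of the patch; the layout string and the main() harness are made up for illustration, the real layout comes from the AMDGPU subtarget):

// Standalone illustration only: shows what getABITypeAlignment returns for
// the <4 x i8> memory type used by the cvt_f32_ubyte tests below, with a
// hypothetical layout string.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout DL("e-p:64:64-i32:32-v32:32-v64:64-v128:128");

  // <4 x i8> is the memory type loaded before the uchar-to-float conversion.
  Type *V4I8 = VectorType::get(Type::getInt8Ty(Ctx), 4);

  // With this layout the ABI alignment is 4, so an 'align 1' load is
  // under-aligned and the combine bails out unless
  // allowsMisalignedMemoryAccesses accepts the access anyway.
  outs() << "ABI alignment of <4 x i8>: "
         << DL.getABITypeAlignment(V4I8) << '\n';
  return 0;
}
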
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
- static SDValue performUCharToFloatCombine(SDNode *N,
- DAGCombinerInfo &DCI);
+ SDValue performUCharToFloatCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const;
SDValue performSHLPtrCombine(SDNode *N,
unsigned AS,
DAGCombinerInfo &DCI) const;
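
The header change mirrors the .cpp change: the combine now calls allowsMisalignedMemoryAccesses, a non-static virtual hook, so the helper can no longer be a static member and becomes a const member instead. A toy sketch of that constraint (hypothetical names, not the real TargetLowering hierarchy):

#include <cassert>

// Toy stand-in for the lowering class; names are hypothetical.
struct ToyLowering {
  virtual ~ToyLowering() = default;
  virtual bool allowsMisaligned(unsigned Align) const { return Align >= 4; }

  // Needs 'this' to reach the virtual hook, so it cannot be static;
  // it does not mutate state, so it is marked const.
  bool combineIsProfitable(unsigned Align) const {
    return allowsMisaligned(Align);
  }
};

int main() {
  ToyLowering TL;
  assert(TL.combineIsProfitable(4));
  assert(!TL.combineIsProfitable(1));
  return 0;
}
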
; SI-DAG: v_cvt_f32_ubyte0_e32
; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <3 x i8> addrspace(1)* %in, align 1
+ %load = load <3 x i8> addrspace(1)* %in, align 4
%cvt = uitofp <3 x i8> %load to <3 x float>
store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
ret void
; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
-; SI: v_lshlrev_b32
-; SI: v_or_b32
-; SI: v_lshlrev_b32
-; SI: v_or_b32
-; SI: v_lshlrev_b32
-; SI: v_or_b32
-
-; XSI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG0]]
-; XSI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
-; XSI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
-; XSI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG3]]
-
-; SI-DAG: v_cvt_f32_ubyte0_e32
-; SI-DAG: v_cvt_f32_ubyte1_e32
-; SI-DAG: v_cvt_f32_ubyte2_e32
-; SI-DAG: v_cvt_f32_ubyte3_e32
+; SI-NOT: v_lshlrev_b32
+; SI-NOT: v_or_b32
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {