test/CodeGen/R600/cvt_f32_ubyte.ll

   1 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
   2
   3 ; SI-LABEL: {{^}}load_i8_to_f32:
   4 ; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]],
   5 ; SI-NOT: bfe
   6 ; SI-NOT: lshr
   7 ; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
   8 ; SI: buffer_store_dword [[CONV]],
   9 define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
  10   %byte = load i8 addrspace(1)* %in, align 1 ; one i8 read through %in
  11   %byte.f32 = uitofp i8 %byte to float ; unsigned conversion; checks above expect the ubyte0 form with no bfe/lshr
  12   store float %byte.f32, float addrspace(1)* %out, align 4
  13   ret void
  14 }
  15
  16 ; SI-LABEL: {{^}}load_v2i8_to_v2f32:
  17 ; SI: buffer_load_ushort [[LOADREG:v[0-9]+]],
  18 ; SI-NOT: bfe
  19 ; SI-NOT: lshr
  20 ; SI-NOT: and
  21 ; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
  22 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
  23 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
  24 define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
  25   %pair = load <2 x i8> addrspace(1)* %in, align 2 ; both bytes come from one 2-byte-aligned vector load
  26   %pair.f32 = uitofp <2 x i8> %pair to <2 x float> ; per-lane unsigned conversion; checks expect ubyte0/ubyte1 on the same loaded register
  27   store <2 x float> %pair.f32, <2 x float> addrspace(1)* %out, align 16
  28   ret void
  29 }
  30
  31 ; SI-LABEL: {{^}}load_v3i8_to_v3f32:
  32 ; SI-NOT: bfe
  33 ; SI-NOT: v_cvt_f32_ubyte3_e32
  34 ; SI-DAG: v_cvt_f32_ubyte2_e32
  35 ; SI-DAG: v_cvt_f32_ubyte1_e32
  36 ; SI-DAG: v_cvt_f32_ubyte0_e32
  37 ; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}},
  38 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
  39   %triple = load <3 x i8> addrspace(1)* %in, align 1 ; three bytes, byte-aligned
  40   %triple.f32 = uitofp <3 x i8> %triple to <3 x float> ; only ubyte0..ubyte2 conversions are expected; ubyte3 must not appear
  41   store <3 x float> %triple.f32, <3 x float> addrspace(1)* %out, align 16 ; the dwordx2 store check matches any register pair: no result registers are captured in this function
  42   ret void
  43 }
  44
  45 ; SI-LABEL: {{^}}load_v4i8_to_v4f32:
  46 ; SI: buffer_load_dword [[LOADREG:v[0-9]+]]
  47 ; SI-NOT: bfe
  48 ; SI-NOT: lshr
  49 ; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
  50 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
  51 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
  52 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
  53 ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
  54 define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
  55   %quad = load <4 x i8> addrspace(1)* %in, align 4 ; 4-byte aligned, so the checks expect a single dword load
  56   %quad.f32 = uitofp <4 x i8> %quad to <4 x float> ; each lane should convert straight from the loaded dword (ubyte0..ubyte3)
  57   store <4 x float> %quad.f32, <4 x float> addrspace(1)* %out, align 16
  58   ret void
  59 }
  60
  61 ; FIXME: Ideally no instructions would be added to shift each byte into its
  62 ; position in the word; the v_lshlrev_b32/v_or_b32 checks below pin current output.
  63
  64 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
  65 ; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
  66 ; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
  67 ; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
  68 ; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
  69
  70 ; SI: v_lshlrev_b32
  71 ; SI: v_or_b32
  72 ; SI: v_lshlrev_b32
  73 ; SI: v_or_b32
  74 ; SI: v_lshlrev_b32
  75 ; SI: v_or_b32
  76
  77 ; XSI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG0]]
  78 ; XSI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
  79 ; XSI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
  80 ; XSI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG3]]
  81
  82 ; SI-DAG: v_cvt_f32_ubyte0_e32
  83 ; SI-DAG: v_cvt_f32_ubyte1_e32
  84 ; SI-DAG: v_cvt_f32_ubyte2_e32
  85 ; SI-DAG: v_cvt_f32_ubyte3_e32
  86
  87 ; SI: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}},
  88 define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
  89   %quad = load <4 x i8> addrspace(1)* %in, align 1 ; byte-aligned: checks expect four separate ubyte loads
  90   %quad.f32 = uitofp <4 x i8> %quad to <4 x float> ; currently converted via ubyte0..ubyte3 after the bytes are or'ed together
  91   store <4 x float> %quad.f32, <4 x float> addrspace(1)* %out, align 16 ; the dwordx4 store check matches any register range: the result captures exist only on the disabled XSI lines
  92   ret void
  93 }
  94
  95 ; XXX - This should really still be able to use the v_cvt_f32_ubyte0
  96 ; for each component, but computeKnownBits doesn't handle vectors very
  97 ; well.
  98
  99 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
 100 ; SI: buffer_load_ubyte
 101 ; SI: buffer_load_ubyte
 102 ; SI: buffer_load_ubyte
 103 ; SI: buffer_load_ubyte
 104 ; SI: v_cvt_f32_ubyte0_e32
 105 ; SI: v_cvt_f32_ubyte0_e32
 106 ; SI: v_cvt_f32_ubyte0_e32
 107 ; SI: v_cvt_f32_ubyte0_e32
 108
 109 ; XXX - replace with this when v4i8 loads aren't scalarized anymore.
 110 ; XSI: buffer_load_dword
 111 ; XSI: v_cvt_f32_u32_e32
 112 ; XSI: v_cvt_f32_u32_e32
 113 ; XSI: v_cvt_f32_u32_e32
 114 ; XSI: v_cvt_f32_u32_e32
 115 ; SI: s_endpgm
 116 define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
 117   %quad = load <4 x i8> addrspace(1)* %in, align 4 ; shared by the conversion and the integer add below
 118   %quad.f32 = uitofp <4 x i8> %quad to <4 x float> ; first use of %quad
 119   store <4 x float> %quad.f32, <4 x float> addrspace(1)* %out, align 16
 120   %bumped = add <4 x i8> %quad, <i8 9, i8 9, i8 9, i8 9> ; Second use of %quad
 121   store <4 x i8> %bumped, <4 x i8> addrspace(1)* %out2, align 4
 122   ret void
 123 }
 124
 125 ; Crash test only: compilation just has to reach the end of the program.
 126 ; SI-LABEL: {{^}}load_v7i8_to_v7f32:
 127 ; SI: s_endpgm
 128 define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
 129   %septet = load <7 x i8> addrspace(1)* %in, align 1 ; odd, non-power-of-two element count
 130   %septet.f32 = uitofp <7 x i8> %septet to <7 x float>
 131   store <7 x float> %septet.f32, <7 x float> addrspace(1)* %out, align 16
 132   ret void
 133 }
 134
 135 ; SI-LABEL: {{^}}load_v8i8_to_v8f32:
 136 ; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
 137 ; SI-NOT: bfe
 138 ; SI-NOT: lshr
 139 ; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
 140 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
 141 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
 142 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
 143 ; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
 144 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
 145 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
 146 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
 147 ; SI-NOT: bfe
 148 ; SI-NOT: lshr
 149 ; SI: buffer_store_dword
 150 ; SI: buffer_store_dword
 151 ; SI: buffer_store_dword
 152 ; SI: buffer_store_dword
 153 ; SI: buffer_store_dword
 154 ; SI: buffer_store_dword
 155 ; SI: buffer_store_dword
 156 ; SI: buffer_store_dword
 157 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
 158   %octet = load <8 x i8> addrspace(1)* %in, align 1 ; checks expect one dwordx2 load feeding all eight conversions
 159   %octet.f32 = uitofp <8 x i8> %octet to <8 x float> ; four ubyteN conversions per loaded dword, no bfe/lshr
 160   store <8 x float> %octet.f32, <8 x float> addrspace(1)* %out, align 16
 161   ret void
 162 }
 163
 164 ; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
 165 ; SI: buffer_load_dword [[LOADREG:v[0-9]+]],
 166 ; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]]
 167 ; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
 168 ; SI: buffer_store_dword [[CONV]],
 169 define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
 170   %word = load i32 addrspace(1)* %in, align 4
 171   %sum = add i32 %word, 2 ; arithmetic before the mask, so the conversion cannot come straight from the load
 172   %low.byte = and i32 %sum, 255 ; keep bits 0..7 only (in-register zero extension of the low byte)
 173   %low.byte.f32 = uitofp i32 %low.byte to float ; checks expect the mask to fold into the ubyte0 conversion right after the add
 174   store float %low.byte.f32, float addrspace(1)* %out, align 4
 175   ret void
 176 }
 177
 178 ; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
 179 define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
 180   %word = load i32 addrspace(1)* %in, align 4
 181   %byte1.inplace = and i32 %word, 65280 ; 65280 = 0xff00: keep bits 8..15
 182   %byte1 = lshr i32 %byte1.inplace, 8 ; move that byte down to bits 0..7
 183   %byte1.f32 = uitofp i32 %byte1 to float ; no instruction checks here: only the label is verified
 184   store float %byte1.f32, float addrspace(1)* %out, align 4
 185   ret void
 186 }
 187
 188
 189 ; The explicit zext keeps these cases from being matched, but instcombine
 190 ; removes the zext beforehand, so it shouldn't really matter.
 191 define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
 192   %byte = load i8 addrspace(1)* %in, align 1
 193   %widened = zext i8 %byte to i32 ; explicit widening ahead of the conversion
 194   %widened.f32 = uitofp i32 %widened to float ; converts the i32, not the i8; nothing is checked for this function
 195   store float %widened.f32, float addrspace(1)* %out, align 4
 196   ret void
 197 }
 198
 199 define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
 200   %quad = load <4 x i8> addrspace(1)* %in, align 1 ; byte-aligned vector load
 201   %quad.wide = zext <4 x i8> %quad to <4 x i32> ; per-lane explicit widening ahead of the conversion
 202   %quad.f32 = uitofp <4 x i32> %quad.wide to <4 x float> ; vector form of the zext-then-convert case; nothing is checked for this function
 203   store <4 x float> %quad.f32, <4 x float> addrspace(1)* %out, align 16
 204   ret void
 205 }