test/CodeGen/X86/unaligned-32-byte-memops.ll

   1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
   2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
   3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
   4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
   5
   6 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
   7 ; because that is slower than two 16-byte loads.
   8 ; Other AVX-capable chips don't have that problem.
   9
  10 define <8 x float> @load32bytes(<8 x float>* %Ap) { ; Loads one 32-byte vector through a pointer that is only 16-byte aligned.
  11   ; CHECK-LABEL: load32bytes
  12   ; Expected: the Sandy Bridge / Ivy Bridge runs see vmovaps then vinsertf128; the btver2 and core-avx2 runs see a single vmovups.
  13   ; SANDYB: vmovaps
  14   ; SANDYB: vinsertf128
  15   ; SANDYB: retq
  16
  17   ; BTVER2: vmovups
  18   ; BTVER2: retq
  19
  20   ; HASWELL: vmovups
  21   ; HASWELL: retq
  22
  23   %A = load <8 x float>* %Ap, align 16 ; align 16 is smaller than the 32-byte size of <8 x float>
  24   ret <8 x float> %A
  25 }
  26
  27 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
  28 ; because that is slower than two 16-byte stores.
  29 ; Other AVX-capable chips don't have that problem.
  30
  31 define void @store32bytes(<8 x float> %A, <8 x float>* %P) { ; Stores one 32-byte vector through a pointer that is only 16-byte aligned.
  32   ; CHECK-LABEL: store32bytes
  33   ; Expected: the Sandy Bridge / Ivy Bridge runs see vextractf128 then vmovaps; the btver2 and core-avx2 runs see a single vmovups.
  34   ; SANDYB: vextractf128
  35   ; SANDYB: vmovaps
  36   ; SANDYB: retq
  37
  38   ; BTVER2: vmovups
  39   ; BTVER2: retq
  40
  41   ; HASWELL: vmovups
  42   ; HASWELL: retq
  43
  44   store <8 x float> %A, <8 x float>* %P, align 16 ; align 16 is smaller than the 32-byte size of <8 x float>
  45   ret void
  46 }
  47
  48 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
  49 ; if it's faster.
  50
  51 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) ; declared so the intrinsic-based tests in this file can call it
  52
  53 ; Use the vinsertf128 intrinsic to model source code
  54 ; that explicitly uses AVX intrinsics.
  55 define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) { ; Builds an <8 x float> from two adjacent unaligned 16-byte loads via the vinsertf128 intrinsic.
  56   ; CHECK-LABEL: combine_16_byte_loads
  57   ; Expected: the Sandy Bridge / Ivy Bridge runs see vmovups immediately followed by vinsertf128; the btver2 and core-avx2 runs see one vmovups and then the return.
  58   ; SANDYB: vmovups
  59   ; SANDYB-NEXT: vinsertf128
  60   ; SANDYB-NEXT: retq
  61
  62   ; BTVER2: vmovups
  63   ; BTVER2-NEXT: retq
  64
  65   ; HASWELL: vmovups
  66   ; HASWELL-NEXT: retq
  67
  68   %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1 ; next <4 x float> element, i.e. 16 bytes past %ptr
  69   %v1 = load <4 x float>* %ptr, align 1 ; first 16 bytes, no alignment guarantee
  70   %v2 = load <4 x float>* %ptr2, align 1 ; following 16 bytes, no alignment guarantee
  71   %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> ; %v1 in lanes 0-3, lanes 4-7 undef
  72   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1) ; inserts %v2; i8 1 presumably selects the upper 128 bits - confirm against the intrinsic definition
  73   ret <8 x float> %v3
  74 }
  75
  76 ; Swap the operands of the shufflevector and vinsertf128 to ensure that the
  77 ; pattern still matches.
  78 define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) { ; Same two adjacent loads, but the shuffle carries the second load and the intrinsic inserts the first.
  79   ; CHECK-LABEL: combine_16_byte_loads_swap
  80   ; Expected: the Sandy Bridge / Ivy Bridge runs see vmovups immediately followed by vinsertf128; the btver2 and core-avx2 runs see one vmovups and then the return.
  81   ; SANDYB: vmovups
  82   ; SANDYB-NEXT: vinsertf128
  83   ; SANDYB-NEXT: retq
  84
  85   ; BTVER2: vmovups
  86   ; BTVER2-NEXT: retq
  87
  88   ; HASWELL: vmovups
  89   ; HASWELL-NEXT: retq
  90
  91   %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1 ; next <4 x float> element, i.e. 16 bytes past %ptr
  92   %v1 = load <4 x float>* %ptr, align 1 ; first 16 bytes, no alignment guarantee
  93   %v2 = load <4 x float>* %ptr2, align 1 ; following 16 bytes, no alignment guarantee
  94   %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3> ; %v2 in lanes 4-7, lanes 0-3 undef
  95   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0) ; inserts %v1; i8 0 presumably selects the lower 128 bits - confirm against the intrinsic definition
  96   ret <8 x float> %v3
  97 }
  98
  99 ; Replace the vinsertf128 intrinsic with a shufflevector as might be
 100 ; expected from auto-vectorized code.
 101 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) { ; Concatenates two adjacent unaligned 16-byte loads with a plain shufflevector.
 102   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
 103   ; Expected: the Sandy Bridge / Ivy Bridge runs see vmovups immediately followed by vinsertf128; the btver2 and core-avx2 runs see one vmovups and then the return.
 104   ; SANDYB: vmovups
 105   ; SANDYB-NEXT: vinsertf128
 106   ; SANDYB-NEXT: retq
 107
 108   ; BTVER2: vmovups
 109   ; BTVER2-NEXT: retq
 110
 111   ; HASWELL: vmovups
 112   ; HASWELL-NEXT: retq
 113
 114   %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1 ; next <4 x float> element, i.e. 16 bytes past %ptr
 115   %v1 = load <4 x float>* %ptr, align 1 ; first 16 bytes, no alignment guarantee
 116   %v2 = load <4 x float>* %ptr2, align 1 ; following 16 bytes, no alignment guarantee
 117   %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask: %v1 in lanes 0-3, %v2 in lanes 4-7
 118   ret <8 x float> %v3
 119 }
 120
 121 ; Swap the order of the shufflevector operands to ensure that the
 122 ; pattern still matches.
 123 define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) { ; Same concatenation, with the shufflevector operands given in the opposite order.
 124   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
 125   ; Expected: the Sandy Bridge / Ivy Bridge runs see vmovups immediately followed by vinsertf128; the btver2 and core-avx2 runs see one vmovups and then the return.
 126   ; SANDYB: vmovups
 127   ; SANDYB-NEXT: vinsertf128
 128   ; SANDYB-NEXT: retq
 129
 130   ; BTVER2: vmovups
 131   ; BTVER2-NEXT: retq
 132
 133   ; HASWELL: vmovups
 134   ; HASWELL-NEXT: retq
 135
 136   %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1 ; next <4 x float> element, i.e. 16 bytes past %ptr
 137   %v1 = load <4 x float>* %ptr, align 1 ; first 16 bytes, no alignment guarantee
 138   %v2 = load <4 x float>* %ptr2, align 1 ; following 16 bytes, no alignment guarantee
 139   %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> ; indices 4-7 pick the second operand (%v1), 0-3 the first (%v2): result is still %v1 then %v2
 140   ret <8 x float> %v3
 141 }
 142
 143 ; Check each element type other than float to make sure it is handled correctly.
 144 ; Use the loaded values with an 'add' to make sure we're using the correct load type.
 145 ; Even though BtVer2 has fast 32-byte loads, we should not generate those for
 146 ; 256-bit integer vectors because BtVer2 doesn't have AVX2.
 147
 148 define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) { ; Integer variant: two adjacent unaligned <2 x i64> loads concatenated, then added to %x.
 149   ; CHECK-LABEL: combine_16_byte_loads_i64
 150   ; Expected: the Sandy Bridge / Ivy Bridge and btver2 runs see vextractf128, two vpaddq, vinsertf128; the core-avx2 run sees vmovdqu and a single vpaddq.
 151   ; SANDYB: vextractf128
 152   ; SANDYB-NEXT: vpaddq
 153   ; SANDYB-NEXT: vpaddq
 154   ; SANDYB-NEXT: vinsertf128
 155   ; SANDYB-NEXT: retq
 156
 157   ; BTVER2: vextractf128
 158   ; BTVER2-NEXT: vpaddq
 159   ; BTVER2-NEXT: vpaddq
 160   ; BTVER2-NEXT: vinsertf128
 161   ; BTVER2-NEXT: retq
 162
 163   ; HASWELL: vmovdqu
 164   ; HASWELL-NEXT: vpaddq
 165   ; HASWELL-NEXT: retq
 166
 167   %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 1 ; next <2 x i64> element, i.e. 16 bytes past %ptr
 168   %v1 = load <2 x i64>* %ptr, align 1 ; first 16 bytes, no alignment guarantee
 169   %v2 = load <2 x i64>* %ptr2, align 1 ; following 16 bytes, no alignment guarantee
 170   %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; identity mask: %v1 then %v2
 171   %v4 = add <4 x i64> %v3, %x ; integer add uses the concatenated value so the expected load type is visible in the output
 172   ret <4 x i64> %v4
 173 }
 174
 175 define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) { ; Integer variant: two adjacent unaligned <4 x i32> loads concatenated, then added to %x.
 176   ; CHECK-LABEL: combine_16_byte_loads_i32
 177   ; Expected: the Sandy Bridge / Ivy Bridge and btver2 runs see vextractf128, two vpaddd, vinsertf128; the core-avx2 run sees vmovdqu and a single vpaddd.
 178   ; SANDYB: vextractf128
 179   ; SANDYB-NEXT: vpaddd
 180   ; SANDYB-NEXT: vpaddd
 181   ; SANDYB-NEXT: vinsertf128
 182   ; SANDYB-NEXT: retq
 183
 184   ; BTVER2: vextractf128
 185   ; BTVER2-NEXT: vpaddd
 186   ; BTVER2-NEXT: vpaddd
 187   ; BTVER2-NEXT: vinsertf128
 188   ; BTVER2-NEXT: retq
 189
 190   ; HASWELL: vmovdqu
 191   ; HASWELL-NEXT: vpaddd
 192   ; HASWELL-NEXT: retq
 193
 194   %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 1 ; next <4 x i32> element, i.e. 16 bytes past %ptr
 195   %v1 = load <4 x i32>* %ptr, align 1 ; first 16 bytes, no alignment guarantee
 196   %v2 = load <4 x i32>* %ptr2, align 1 ; following 16 bytes, no alignment guarantee
 197   %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask: %v1 then %v2
 198   %v4 = add <8 x i32> %v3, %x ; integer add uses the concatenated value so the expected load type is visible in the output
 199   ret <8 x i32> %v4
 200 }
 201
 202 define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) { ; Integer variant: two adjacent unaligned <8 x i16> loads concatenated, then added to %x.
 203   ; CHECK-LABEL: combine_16_byte_loads_i16
 204   ; Expected: the Sandy Bridge / Ivy Bridge and btver2 runs see vextractf128, two vpaddw, vinsertf128; the core-avx2 run sees vmovdqu and a single vpaddw.
 205   ; SANDYB: vextractf128
 206   ; SANDYB-NEXT: vpaddw
 207   ; SANDYB-NEXT: vpaddw
 208   ; SANDYB-NEXT: vinsertf128
 209   ; SANDYB-NEXT: retq
 210
 211   ; BTVER2: vextractf128
 212   ; BTVER2-NEXT: vpaddw
 213   ; BTVER2-NEXT: vpaddw
 214   ; BTVER2-NEXT: vinsertf128
 215   ; BTVER2-NEXT: retq
 216
 217   ; HASWELL: vmovdqu
 218   ; HASWELL-NEXT: vpaddw
 219   ; HASWELL-NEXT: retq
 220
 221   %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 1 ; next <8 x i16> element, i.e. 16 bytes past %ptr
 222   %v1 = load <8 x i16>* %ptr, align 1 ; first 16 bytes, no alignment guarantee
 223   %v2 = load <8 x i16>* %ptr2, align 1 ; following 16 bytes, no alignment guarantee
 224   %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; identity mask: %v1 then %v2
 225   %v4 = add <16 x i16> %v3, %x ; integer add uses the concatenated value so the expected load type is visible in the output
 226   ret <16 x i16> %v4
 227 }
 228
 229 define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) { ; Integer variant: two adjacent unaligned <16 x i8> loads concatenated, then added to %x.
 230   ; CHECK-LABEL: combine_16_byte_loads_i8
 231   ; Expected: the Sandy Bridge / Ivy Bridge and btver2 runs see vextractf128, two vpaddb, vinsertf128; the core-avx2 run sees vmovdqu and a single vpaddb.
 232   ; SANDYB: vextractf128
 233   ; SANDYB-NEXT: vpaddb
 234   ; SANDYB-NEXT: vpaddb
 235   ; SANDYB-NEXT: vinsertf128
 236   ; SANDYB-NEXT: retq
 237
 238   ; BTVER2: vextractf128
 239   ; BTVER2-NEXT: vpaddb
 240   ; BTVER2-NEXT: vpaddb
 241   ; BTVER2-NEXT: vinsertf128
 242   ; BTVER2-NEXT: retq
 243
 244   ; HASWELL: vmovdqu
 245   ; HASWELL-NEXT: vpaddb
 246   ; HASWELL-NEXT: retq
 247
 248   %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 1 ; next <16 x i8> element, i.e. 16 bytes past %ptr
 249   %v1 = load <16 x i8>* %ptr, align 1 ; first 16 bytes, no alignment guarantee
 250   %v2 = load <16 x i8>* %ptr2, align 1 ; following 16 bytes, no alignment guarantee
 251   %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; identity mask: %v1 then %v2
 252   %v4 = add <32 x i8> %v3, %x ; integer add uses the concatenated value so the expected load type is visible in the output
 253   ret <32 x i8> %v4
 254 }
 255
 256 define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) { ; Double variant: two adjacent unaligned <2 x double> loads concatenated, then fadd'ed with %x.
 257   ; CHECK-LABEL: combine_16_byte_loads_double
 258   ; Expected: the Sandy Bridge / Ivy Bridge runs see vmovupd, vinsertf128, vaddpd; the btver2 and core-avx2 runs see vmovupd and vaddpd.
 259   ; SANDYB: vmovupd
 260   ; SANDYB-NEXT: vinsertf128
 261   ; SANDYB-NEXT: vaddpd
 262   ; SANDYB-NEXT: retq
 263
 264   ; BTVER2: vmovupd
 265   ; BTVER2-NEXT: vaddpd
 266   ; BTVER2-NEXT: retq
 267
 268   ; HASWELL: vmovupd
 269   ; HASWELL: vaddpd
 270   ; HASWELL-NEXT: retq
 271   ; NOTE(review): the core-avx2 vaddpd check above is not a next-line check, unlike the btver2 one - confirm whether that looseness is intentional.
 272   %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 1 ; next <2 x double> element, i.e. 16 bytes past %ptr
 273   %v1 = load <2 x double>* %ptr, align 1 ; first 16 bytes, no alignment guarantee
 274   %v2 = load <2 x double>* %ptr2, align 1 ; following 16 bytes, no alignment guarantee
 275   %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; identity mask: %v1 then %v2
 276   %v4 = fadd <4 x double> %v3, %x ; floating-point add uses the concatenated value
 277   ret <4 x double> %v4
 278 }
 279