test/CodeGen/X86/unaligned-32-byte-memops.ll

   1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
   2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
   3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
   4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
   5
   6 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
   7 ; because that is slower than two 16-byte loads.
   8 ; Other AVX-capable chips don't have that problem.
   9
  10 define <8 x float> @load32bytes(<8 x float>* %Ap) {
  11   ; CHECK-LABEL: load32bytes
  12   ; corei7-avx / core-avx-i runs: vmovaps, vinsertf128 and retq must appear in that order.
  13   ; SANDYB: vmovaps
  14   ; SANDYB: vinsertf128
  15   ; SANDYB: retq
  16   ; btver2 run: a vmovups must appear before retq.
  17   ; BTVER2: vmovups
  18   ; BTVER2: retq
  19   ; core-avx2 run: a vmovups must appear before retq.
  20   ; HASWELL: vmovups
  21   ; HASWELL: retq
  22   ; Test input: load a 32-byte <8 x float> through a pointer marked only align 16.
  23   %A = load <8 x float>* %Ap, align 16
  24   ret <8 x float> %A
  25 }
  26
  27 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
  28 ; because that is slower than two 16-byte stores.
  29 ; Other AVX-capable chips don't have that problem.
  30
  31 define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
  32   ; CHECK-LABEL: store32bytes
  33   ; corei7-avx / core-avx-i runs: vextractf128, vmovaps and retq must appear in that order.
  34   ; SANDYB: vextractf128
  35   ; SANDYB: vmovaps
  36   ; SANDYB: retq
  37   ; btver2 run: a vmovups must appear before retq.
  38   ; BTVER2: vmovups
  39   ; BTVER2: retq
  40   ; core-avx2 run: a vmovups must appear before retq.
  41   ; HASWELL: vmovups
  42   ; HASWELL: retq
  43   ; Test input: store the 32-byte <8 x float> argument through a pointer marked only align 16.
  44   store <8 x float> %A, <8 x float>* %P, align 16
  45   ret void
  46 }
  47
  48 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
  49 ; if it's faster.
  50
  51 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
  52
  53 ; Use the vinsertf128 intrinsic to model source code
  54 ; that explicitly uses AVX intrinsics.
  55 define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
  56   ; CHECK-LABEL: combine_16_byte_loads
  57   ; corei7-avx / core-avx-i runs: vmovups, vinsertf128 and retq on consecutive output lines.
  58   ; SANDYB: vmovups
  59   ; SANDYB-NEXT: vinsertf128
  60   ; SANDYB-NEXT: retq
  61   ; btver2 run: vmovups immediately followed by retq.
  62   ; BTVER2: vmovups
  63   ; BTVER2-NEXT: retq
  64   ; core-avx2 run: vmovups immediately followed by retq.
  65   ; HASWELL: vmovups
  66   ; HASWELL-NEXT: retq
  67   ; Test input: align-1 loads of adjacent <4 x float> slots 1 and 2; %v1 is widened with undef upper lanes, then %v2 is inserted with immediate 1 (presumably the upper half - confirm against the intrinsic's definition).
  68   %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
  69   %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
  70   %v1 = load <4 x float>* %ptr1, align 1
  71   %v2 = load <4 x float>* %ptr2, align 1
  72   %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  73   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
  74   ret <8 x float> %v3
  75 }
  76
  77 ; Swap the operands of the shufflevector and vinsertf128 to ensure that the
  78 ; pattern still matches.
  79 define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
  80   ; CHECK-LABEL: combine_16_byte_loads_swap
  81   ; corei7-avx / core-avx-i runs: vmovups, vinsertf128 and retq on consecutive output lines.
  82   ; SANDYB: vmovups
  83   ; SANDYB-NEXT: vinsertf128
  84   ; SANDYB-NEXT: retq
  85   ; btver2 run: vmovups immediately followed by retq.
  86   ; BTVER2: vmovups
  87   ; BTVER2-NEXT: retq
  88   ; core-avx2 run: vmovups immediately followed by retq.
  89   ; HASWELL: vmovups
  90   ; HASWELL-NEXT: retq
  91   ; Test input: align-1 loads of adjacent slots 2 and 3; here %v2 is shuffled into result lanes 4-7 and %v1 is inserted with immediate 0 (presumably the lower half - confirm against the intrinsic's definition).
  92   %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
  93   %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
  94   %v1 = load <4 x float>* %ptr1, align 1
  95   %v2 = load <4 x float>* %ptr2, align 1
  96   %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
  97   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
  98   ret <8 x float> %v3
  99 }
 100
 101 ; Replace the vinsertf128 intrinsic with a shufflevector as might be
 102 ; expected from auto-vectorized code.
 103 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
 104   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
 105   ; corei7-avx / core-avx-i runs: vmovups, vinsertf128 and retq on consecutive output lines.
 106   ; SANDYB: vmovups
 107   ; SANDYB-NEXT: vinsertf128
 108   ; SANDYB-NEXT: retq
 109   ; btver2 run: vmovups immediately followed by retq.
 110   ; BTVER2: vmovups
 111   ; BTVER2-NEXT: retq
 112   ; core-avx2 run: vmovups immediately followed by retq.
 113   ; HASWELL: vmovups
 114   ; HASWELL-NEXT: retq
 115   ; Test input: align-1 loads of adjacent <4 x float> slots 3 and 4, concatenated as %v1,%v2 by one shufflevector with the identity mask 0..7.
 116   %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
 117   %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
 118   %v1 = load <4 x float>* %ptr1, align 1
 119   %v2 = load <4 x float>* %ptr2, align 1
 120   %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 121   ret <8 x float> %v3
 122 }
 123
 124 ; Swap the order of the shufflevector operands to ensure that the
 125 ; pattern still matches.
 126 define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
 127   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
 128   ; corei7-avx / core-avx-i runs: vmovups, vinsertf128 and retq on consecutive output lines.
 129   ; SANDYB: vmovups
 130   ; SANDYB-NEXT: vinsertf128
 131   ; SANDYB-NEXT: retq
 132   ; btver2 run: vmovups immediately followed by retq.
 133   ; BTVER2: vmovups
 134   ; BTVER2-NEXT: retq
 135   ; core-avx2 run: vmovups immediately followed by retq.
 136   ; HASWELL: vmovups
 137   ; HASWELL-NEXT: retq
 138   ; Test input: align-1 loads of adjacent slots 4 and 5; the shuffle takes (%v2, %v1) with mask 4..7,0..3, so the result is still %v1 followed by %v2.
 139   %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
 140   %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
 141   %v1 = load <4 x float>* %ptr1, align 1
 142   %v2 = load <4 x float>* %ptr2, align 1
 143   %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
 144   ret <8 x float> %v3
 145 }
 146
 147 ; Check each element type other than float to make sure it is handled correctly.
 148 ; Use the loaded values with an 'add' to make sure we're using the correct load type.
 149 ; Even though BtVer2 has fast 32-byte loads, we should not generate those for
 150 ; 256-bit integer vectors because BtVer2 doesn't have AVX2.
 151
 152 define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
 153   ; CHECK-LABEL: combine_16_byte_loads_i64
 154   ; corei7-avx / core-avx-i runs: vextractf128, two vpaddq, vinsertf128 and retq on consecutive output lines.
 155   ; SANDYB: vextractf128
 156   ; SANDYB-NEXT: vpaddq
 157   ; SANDYB-NEXT: vpaddq
 158   ; SANDYB-NEXT: vinsertf128
 159   ; SANDYB-NEXT: retq
 160   ; btver2 run: the same five-instruction sequence is expected.
 161   ; BTVER2: vextractf128
 162   ; BTVER2-NEXT: vpaddq
 163   ; BTVER2-NEXT: vpaddq
 164   ; BTVER2-NEXT: vinsertf128
 165   ; BTVER2-NEXT: retq
 166   ; core-avx2 run: no vextract between the function label and vpaddq, and vpaddq is immediately followed by retq.
 167   ; HASWELL-NOT: vextract
 168   ; HASWELL: vpaddq
 169   ; HASWELL-NEXT: retq
 170   ; Test input: align-1 loads of adjacent <2 x i64> slots 5 and 6, concatenated by a shufflevector and added to %x.
 171   %ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
 172   %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
 173   %v1 = load <2 x i64>* %ptr1, align 1
 174   %v2 = load <2 x i64>* %ptr2, align 1
 175   %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 176   %v4 = add <4 x i64> %v3, %x
 177   ret <4 x i64> %v4
 178 }
 179
 180 define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
 181   ; CHECK-LABEL: combine_16_byte_loads_i32
 182   ; corei7-avx / core-avx-i runs: vextractf128, two vpaddd, vinsertf128 and retq on consecutive output lines.
 183   ; SANDYB: vextractf128
 184   ; SANDYB-NEXT: vpaddd
 185   ; SANDYB-NEXT: vpaddd
 186   ; SANDYB-NEXT: vinsertf128
 187   ; SANDYB-NEXT: retq
 188   ; btver2 run: the same five-instruction sequence is expected.
 189   ; BTVER2: vextractf128
 190   ; BTVER2-NEXT: vpaddd
 191   ; BTVER2-NEXT: vpaddd
 192   ; BTVER2-NEXT: vinsertf128
 193   ; BTVER2-NEXT: retq
 194   ; core-avx2 run: no vextract between the function label and vpaddd, and vpaddd is immediately followed by retq.
 195   ; HASWELL-NOT: vextract
 196   ; HASWELL: vpaddd
 197   ; HASWELL-NEXT: retq
 198   ; Test input: align-1 loads of adjacent <4 x i32> slots 6 and 7, concatenated by a shufflevector and added to %x.
 199   %ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
 200   %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
 201   %v1 = load <4 x i32>* %ptr1, align 1
 202   %v2 = load <4 x i32>* %ptr2, align 1
 203   %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 204   %v4 = add <8 x i32> %v3, %x
 205   ret <8 x i32> %v4
 206 }
 207
 208 define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
 209   ; CHECK-LABEL: combine_16_byte_loads_i16
 210   ; corei7-avx / core-avx-i runs: vextractf128, two vpaddw, vinsertf128 and retq on consecutive output lines.
 211   ; SANDYB: vextractf128
 212   ; SANDYB-NEXT: vpaddw
 213   ; SANDYB-NEXT: vpaddw
 214   ; SANDYB-NEXT: vinsertf128
 215   ; SANDYB-NEXT: retq
 216   ; btver2 run: the same five-instruction sequence is expected.
 217   ; BTVER2: vextractf128
 218   ; BTVER2-NEXT: vpaddw
 219   ; BTVER2-NEXT: vpaddw
 220   ; BTVER2-NEXT: vinsertf128
 221   ; BTVER2-NEXT: retq
 222   ; core-avx2 run: no vextract between the function label and vpaddw, and vpaddw is immediately followed by retq.
 223   ; HASWELL-NOT: vextract
 224   ; HASWELL: vpaddw
 225   ; HASWELL-NEXT: retq
 226   ; Test input: align-1 loads of adjacent <8 x i16> slots 7 and 8, concatenated by a shufflevector and added to %x.
 227   %ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
 228   %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
 229   %v1 = load <8 x i16>* %ptr1, align 1
 230   %v2 = load <8 x i16>* %ptr2, align 1
 231   %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 232   %v4 = add <16 x i16> %v3, %x
 233   ret <16 x i16> %v4
 234 }
 235
 236 define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
 237   ; CHECK-LABEL: combine_16_byte_loads_i8
 238   ; corei7-avx / core-avx-i runs: vextractf128, two vpaddb, vinsertf128 and retq on consecutive output lines.
 239   ; SANDYB: vextractf128
 240   ; SANDYB-NEXT: vpaddb
 241   ; SANDYB-NEXT: vpaddb
 242   ; SANDYB-NEXT: vinsertf128
 243   ; SANDYB-NEXT: retq
 244   ; btver2 run: the same five-instruction sequence is expected.
 245   ; BTVER2: vextractf128
 246   ; BTVER2-NEXT: vpaddb
 247   ; BTVER2-NEXT: vpaddb
 248   ; BTVER2-NEXT: vinsertf128
 249   ; BTVER2-NEXT: retq
 250   ; core-avx2 run: no vextract between the function label and vpaddb, and vpaddb is immediately followed by retq.
 251   ; HASWELL-NOT: vextract
 252   ; HASWELL: vpaddb
 253   ; HASWELL-NEXT: retq
 254   ; Test input: align-1 loads of adjacent <16 x i8> slots 8 and 9, concatenated by a shufflevector and added to %x.
 255   %ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
 256   %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
 257   %v1 = load <16 x i8>* %ptr1, align 1
 258   %v2 = load <16 x i8>* %ptr2, align 1
 259   %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 260   %v4 = add <32 x i8> %v3, %x
 261   ret <32 x i8> %v4
 262 }
 263
 264 define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
 265   ; CHECK-LABEL: combine_16_byte_loads_double
 266   ; corei7-avx / core-avx-i runs: vmovupd, vinsertf128, vaddpd and retq on consecutive output lines.
 267   ; SANDYB: vmovupd
 268   ; SANDYB-NEXT: vinsertf128
 269   ; SANDYB-NEXT: vaddpd
 270   ; SANDYB-NEXT: retq
 271   ; btver2 run: no vinsertf128 between the function label and vaddpd, and vaddpd is immediately followed by retq.
 272   ; BTVER2-NOT: vinsertf128
 273   ; BTVER2: vaddpd
 274   ; BTVER2-NEXT: retq
 275   ; core-avx2 run: no vinsertf128 between the function label and vaddpd, and vaddpd is immediately followed by retq.
 276   ; HASWELL-NOT: vinsertf128
 277   ; HASWELL: vaddpd
 278   ; HASWELL-NEXT: retq
 279   ; Test input: align-1 loads of adjacent <2 x double> slots 9 and 10, concatenated by a shufflevector and fadd'ed with %x.
 280   %ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
 281   %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
 282   %v1 = load <2 x double>* %ptr1, align 1
 283   %v2 = load <2 x double>* %ptr2, align 1
 284   %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 285   %v4 = fadd <4 x double> %v3, %x
 286   ret <4 x double> %v4
 287 }
 288