test/CodeGen/X86/unaligned-32-byte-memops.ll

   1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
   2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
   3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
   4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
   5
   6 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
   7 ; because that is slower than two 16-byte loads.
   8 ; Other AVX-capable chips don't have that problem.
   9
  10 define <8 x float> @load32bytes(<8 x float>* %Ap) {
  11   ; CHECK-LABEL: load32bytes
  12   ; Purpose - load one 32-byte vector through a pointer that is only declared 16-byte aligned.
  13   ; SANDYB: vmovaps
  14   ; SANDYB: vinsertf128
  15   ; SANDYB: retq
  16
  17   ; BTVER2: vmovups
  18   ; BTVER2: retq
  19
  20   ; HASWELL: vmovups
  21   ; HASWELL: retq
  22   ; Expected per the patterns above - the corei7-avx and core-avx-i runs show vmovaps then vinsertf128; the btver2 and core-avx2 runs show vmovups.
  23   %A = load <8 x float>, <8 x float>* %Ap, align 16 ; align 16 is smaller than the 32-byte access size
  24   ret <8 x float> %A
  25 }
  26
  27 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
  28 ; because that is slower than two 16-byte stores.
  29 ; Other AVX-capable chips don't have that problem.
  30
  31 define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
  32   ; CHECK-LABEL: store32bytes
  33   ; Purpose - store one 32-byte vector through a pointer that is only declared 16-byte aligned.
  34   ; SANDYB: vextractf128
  35   ; SANDYB: vmovaps
  36   ; SANDYB: retq
  37
  38   ; BTVER2: vmovups
  39   ; BTVER2: retq
  40
  41   ; HASWELL: vmovups
  42   ; HASWELL: retq
  43   ; Expected per the patterns above - the corei7-avx and core-avx-i runs show vextractf128 then vmovaps; the btver2 and core-avx2 runs show vmovups.
  44   store <8 x float> %A, <8 x float>* %P, align 16 ; align 16 is smaller than the 32-byte access size
  45   ret void
  46 }
  47
  48 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
  49 ; if it's faster.
  50
  51 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
  52   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
  53   ; Purpose - concatenate two adjacent, unaligned <4 x float> loads into one <8 x float> value.
  54   ; SANDYB: vmovups
  55   ; SANDYB-NEXT: vinsertf128
  56   ; SANDYB-NEXT: retq
  57
  58   ; BTVER2: vmovups
  59   ; BTVER2-NEXT: retq
  60
  61   ; HASWELL: vmovups
  62   ; HASWELL-NEXT: retq
  63   ; Expected per the patterns above - the corei7-avx and core-avx-i runs show vmovups then vinsertf128; the btver2 and core-avx2 runs show vmovups directly followed by retq.
  64   %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3 ; element 3 (byte offset 48)
  65   %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4 ; element 4, the next 16 bytes (byte offset 64)
  66   %v1 = load <4 x float>, <4 x float>* %ptr1, align 1 ; align 1 gives no alignment guarantee
  67   %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  68   %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask - %v1 is the low half, %v2 the high half
  69   ret <8 x float> %v3
  70 }
  71
  72 ; Swap the order of the shufflevector operands to ensure that the
  73 ; pattern still matches.
  74 define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
  75   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
  76   ; Purpose - same concatenation of two adjacent unaligned <4 x float> loads, but with the shufflevector operands given in reverse order.
  77   ; SANDYB: vmovups
  78   ; SANDYB-NEXT: vinsertf128
  79   ; SANDYB-NEXT: retq
  80
  81   ; BTVER2: vmovups
  82   ; BTVER2-NEXT: retq
  83
  84   ; HASWELL: vmovups
  85   ; HASWELL-NEXT: retq
  86   ; Expected per the patterns above - the corei7-avx and core-avx-i runs show vmovups then vinsertf128; the btver2 and core-avx2 runs show vmovups directly followed by retq.
  87   %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4 ; element 4 (byte offset 64)
  88   %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5 ; element 5, the next 16 bytes (byte offset 80)
  89   %v1 = load <4 x float>, <4 x float>* %ptr1, align 1 ; align 1 gives no alignment guarantee
  90   %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  91   %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> ; mask indices 4..7 pick %v1 (second operand) first, then 0..3 pick %v2, so the result is still %v1 low, %v2 high
  92   ret <8 x float> %v3
  93 }
  94
  95 ; Check each element type other than float to make sure it is handled correctly.
  96 ; Use the loaded values with an 'add' to make sure we're using the correct load type.
  97 ; Even though BtVer2 has fast 32-byte loads, we should not generate those for
  98 ; 256-bit integer vectors because BtVer2 doesn't have AVX2.
  99
 100 define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
 101   ; CHECK-LABEL: combine_16_byte_loads_i64
 102   ; Purpose - concatenate two adjacent unaligned <2 x i64> loads and add the 256-bit result to %x.
 103   ; SANDYB: vextractf128
 104   ; SANDYB-NEXT: vpaddq
 105   ; SANDYB-NEXT: vpaddq
 106   ; SANDYB-NEXT: vinsertf128
 107   ; SANDYB-NEXT: retq
 108
 109   ; BTVER2: vextractf128
 110   ; BTVER2-NEXT: vpaddq
 111   ; BTVER2-NEXT: vpaddq
 112   ; BTVER2-NEXT: vinsertf128
 113   ; BTVER2-NEXT: retq
 114
 115   ; HASWELL-NOT: vextract
 116   ; HASWELL: vpaddq
 117   ; HASWELL-NEXT: retq
 118   ; Expected per the patterns above - the corei7-avx, core-avx-i and btver2 runs show vextractf128, two vpaddq and vinsertf128; the core-avx2 run shows one vpaddq with no vextract before it.
 119   %ptr1 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 5 ; element 5 (byte offset 80)
 120   %ptr2 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 6 ; element 6, the next 16 bytes (byte offset 96)
 121   %v1 = load <2 x i64>, <2 x i64>* %ptr1, align 1 ; align 1 gives no alignment guarantee
 122   %v2 = load <2 x i64>, <2 x i64>* %ptr2, align 1
 123   %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; identity mask - %v1 is the low half, %v2 the high half
 124   %v4 = add <4 x i64> %v3, %x ; 256-bit integer add on the concatenated value
 125   ret <4 x i64> %v4
 126 }
 127
 128 define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
 129   ; CHECK-LABEL: combine_16_byte_loads_i32
 130   ; Purpose - concatenate two adjacent unaligned <4 x i32> loads and add the 256-bit result to %x.
 131   ; SANDYB: vextractf128
 132   ; SANDYB-NEXT: vpaddd
 133   ; SANDYB-NEXT: vpaddd
 134   ; SANDYB-NEXT: vinsertf128
 135   ; SANDYB-NEXT: retq
 136
 137   ; BTVER2: vextractf128
 138   ; BTVER2-NEXT: vpaddd
 139   ; BTVER2-NEXT: vpaddd
 140   ; BTVER2-NEXT: vinsertf128
 141   ; BTVER2-NEXT: retq
 142
 143   ; HASWELL-NOT: vextract
 144   ; HASWELL: vpaddd
 145   ; HASWELL-NEXT: retq
 146   ; Expected per the patterns above - the corei7-avx, core-avx-i and btver2 runs show vextractf128, two vpaddd and vinsertf128; the core-avx2 run shows one vpaddd with no vextract before it.
 147   %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 6 ; element 6 (byte offset 96)
 148   %ptr2 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 7 ; element 7, the next 16 bytes (byte offset 112)
 149   %v1 = load <4 x i32>, <4 x i32>* %ptr1, align 1 ; align 1 gives no alignment guarantee
 150   %v2 = load <4 x i32>, <4 x i32>* %ptr2, align 1
 151   %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask - %v1 is the low half, %v2 the high half
 152   %v4 = add <8 x i32> %v3, %x ; 256-bit integer add on the concatenated value
 153   ret <8 x i32> %v4
 154 }
 155
 156 define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
 157   ; CHECK-LABEL: combine_16_byte_loads_i16
 158   ; Purpose - concatenate two adjacent unaligned <8 x i16> loads and add the 256-bit result to %x.
 159   ; SANDYB: vextractf128
 160   ; SANDYB-NEXT: vpaddw
 161   ; SANDYB-NEXT: vpaddw
 162   ; SANDYB-NEXT: vinsertf128
 163   ; SANDYB-NEXT: retq
 164
 165   ; BTVER2: vextractf128
 166   ; BTVER2-NEXT: vpaddw
 167   ; BTVER2-NEXT: vpaddw
 168   ; BTVER2-NEXT: vinsertf128
 169   ; BTVER2-NEXT: retq
 170
 171   ; HASWELL-NOT: vextract
 172   ; HASWELL: vpaddw
 173   ; HASWELL-NEXT: retq
 174   ; Expected per the patterns above - the corei7-avx, core-avx-i and btver2 runs show vextractf128, two vpaddw and vinsertf128; the core-avx2 run shows one vpaddw with no vextract before it.
 175   %ptr1 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 7 ; element 7 (byte offset 112)
 176   %ptr2 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 8 ; element 8, the next 16 bytes (byte offset 128)
 177   %v1 = load <8 x i16>, <8 x i16>* %ptr1, align 1 ; align 1 gives no alignment guarantee
 178   %v2 = load <8 x i16>, <8 x i16>* %ptr2, align 1
 179   %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; identity mask - %v1 is the low half, %v2 the high half
 180   %v4 = add <16 x i16> %v3, %x ; 256-bit integer add on the concatenated value
 181   ret <16 x i16> %v4
 182 }
 183
 184 define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
 185   ; CHECK-LABEL: combine_16_byte_loads_i8
 186   ; Purpose - concatenate two adjacent unaligned <16 x i8> loads and add the 256-bit result to %x.
 187   ; SANDYB: vextractf128
 188   ; SANDYB-NEXT: vpaddb
 189   ; SANDYB-NEXT: vpaddb
 190   ; SANDYB-NEXT: vinsertf128
 191   ; SANDYB-NEXT: retq
 192
 193   ; BTVER2: vextractf128
 194   ; BTVER2-NEXT: vpaddb
 195   ; BTVER2-NEXT: vpaddb
 196   ; BTVER2-NEXT: vinsertf128
 197   ; BTVER2-NEXT: retq
 198
 199   ; HASWELL-NOT: vextract
 200   ; HASWELL: vpaddb
 201   ; HASWELL-NEXT: retq
 202   ; Expected per the patterns above - the corei7-avx, core-avx-i and btver2 runs show vextractf128, two vpaddb and vinsertf128; the core-avx2 run shows one vpaddb with no vextract before it.
 203   %ptr1 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 8 ; element 8 (byte offset 128)
 204   %ptr2 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 9 ; element 9, the next 16 bytes (byte offset 144)
 205   %v1 = load <16 x i8>, <16 x i8>* %ptr1, align 1 ; align 1 gives no alignment guarantee
 206   %v2 = load <16 x i8>, <16 x i8>* %ptr2, align 1
 207   %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; identity mask - %v1 is the low half, %v2 the high half
 208   %v4 = add <32 x i8> %v3, %x ; 256-bit integer add on the concatenated value
 209   ret <32 x i8> %v4
 210 }
 211
 212 define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
 213   ; CHECK-LABEL: combine_16_byte_loads_double
 214   ; Purpose - concatenate two adjacent unaligned <2 x double> loads and fadd the 256-bit result to %x.
 215   ; SANDYB: vmovupd
 216   ; SANDYB-NEXT: vinsertf128
 217   ; SANDYB-NEXT: vaddpd
 218   ; SANDYB-NEXT: retq
 219
 220   ; BTVER2-NOT: vinsertf128
 221   ; BTVER2: vaddpd
 222   ; BTVER2-NEXT: retq
 223
 224   ; HASWELL-NOT: vinsertf128
 225   ; HASWELL: vaddpd
 226   ; HASWELL-NEXT: retq
 227   ; Expected per the patterns above - the corei7-avx and core-avx-i runs show vmovupd, vinsertf128, vaddpd; the btver2 and core-avx2 runs show vaddpd with no vinsertf128 before it.
 228   %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 9 ; element 9 (byte offset 144)
 229   %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 10 ; element 10, the next 16 bytes (byte offset 160)
 230   %v1 = load <2 x double>, <2 x double>* %ptr1, align 1 ; align 1 gives no alignment guarantee
 231   %v2 = load <2 x double>, <2 x double>* %ptr2, align 1
 232   %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; identity mask - %v1 is the low half, %v2 the high half
 233   %v4 = fadd <4 x double> %v3, %x ; 256-bit floating-point add on the concatenated value
 234   ret <4 x double> %v4
 235 }
 236