1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
6 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
7 ; because that is slower than two 16-byte loads.
8 ; Other AVX-capable chips don't have that problem.
10 define <8 x float> @load32bytes(<8 x float>* %Ap) {
11 ; CHECK-LABEL: load32bytes
; The load below reads a full 32-byte <8 x float> but only declares 16-byte
; alignment, which is the under-aligned case this test exercises.
23 %A = load <8 x float>* %Ap, align 16
27 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
28 ; because that is slower than two 16-byte stores.
29 ; Other AVX-capable chips don't have that problem.
31 define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
32 ; CHECK-LABEL: store32bytes
; The Sandy Bridge / Ivy Bridge runs expect a vextractf128 in the output for
; this function.
34 ; SANDYB: vextractf128
; The store below writes a full 32-byte <8 x float> but only declares 16-byte
; alignment.
44 store <8 x float> %A, <8 x float>* %P, align 16
48 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
; Declaration of the AVX vinsertf128 intrinsic: takes a 256-bit vector, a
; 128-bit vector and an i8 immediate, and returns a 256-bit vector.
51 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
53 ; Use the vinsertf128 intrinsic to model source code
54 ; that explicitly uses AVX intrinsics.
55 define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
56 ; CHECK-LABEL: combine_16_byte_loads
59 ; SANDYB-NEXT: vinsertf128
; Address two adjacent <4 x float> elements (indices 1 and 2), i.e. 32
; contiguous bytes starting 16 bytes past %ptr.
68 %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
69 %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
; Both 16-byte loads declare only 1-byte alignment.
70 %v1 = load <4 x float>* %ptr1, align 1
71 %v2 = load <4 x float>* %ptr2, align 1
; Widen %v1 to 8 lanes: lanes 0-3 hold %v1, lanes 4-7 are undef.
72 %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Insert %v2 with immediate 1 (the upper 128-bit half per the VINSERTF128
; definition), so the result is %v1 followed by %v2.
73 %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
77 ; Swap the operands of the shufflevector and vinsertf128 to ensure that the
78 ; pattern still matches.
79 define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
80 ; CHECK-LABEL: combine_16_byte_loads_swap
83 ; SANDYB-NEXT: vinsertf128
; Address two adjacent <4 x float> elements (indices 2 and 3).
92 %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
93 %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
; Both 16-byte loads declare only 1-byte alignment.
94 %v1 = load <4 x float>* %ptr1, align 1
95 %v2 = load <4 x float>* %ptr2, align 1
; Widen %v2 (the higher-addressed load) into lanes 4-7; lanes 0-3 are undef.
96 %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
; Insert %v1 with immediate 0 (the lower 128-bit half per the VINSERTF128
; definition), so the result is again %v1 followed by %v2.
97 %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
101 ; Replace the vinsertf128 intrinsic with a shufflevector as might be
102 ; expected from auto-vectorized code.
103 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
104 ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
107 ; SANDYB-NEXT: vinsertf128
; Address two adjacent <4 x float> elements (indices 3 and 4).
116 %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
117 %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
; Both 16-byte loads declare only 1-byte alignment.
118 %v1 = load <4 x float>* %ptr1, align 1
119 %v2 = load <4 x float>* %ptr2, align 1
; Identity mask 0..7 over (%v1, %v2): plain concatenation of the two loads.
120 %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
124 ; Swap the order of the shufflevector operands to ensure that the
125 ; pattern still matches.
126 define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
127 ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
130 ; SANDYB-NEXT: vinsertf128
; Address two adjacent <4 x float> elements (indices 4 and 5).
139 %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
140 %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
; Both 16-byte loads declare only 1-byte alignment.
141 %v1 = load <4 x float>* %ptr1, align 1
142 %v2 = load <4 x float>* %ptr2, align 1
; Operands are (%v2, %v1) and the mask selects indices 4-7 (the second
; operand, %v1) before 0-3 (the first operand, %v2), so the result is still
; %v1 followed by %v2.
143 %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
147 ; Check each element type other than float to make sure it is handled correctly.
148 ; Use the loaded values with an 'add' to make sure we're using the correct load type.
149 ; Even though BtVer2 has fast 32-byte loads, we should not generate those for
150 ; 256-bit integer vectors because BtVer2 doesn't have AVX2.
152 define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
153 ; CHECK-LABEL: combine_16_byte_loads_i64
; Sandy Bridge / Ivy Bridge runs: expect vextractf128, two vpaddq and a
; vinsertf128 on consecutive output lines.
155 ; SANDYB: vextractf128
156 ; SANDYB-NEXT: vpaddq
157 ; SANDYB-NEXT: vpaddq
158 ; SANDYB-NEXT: vinsertf128
; btver2 run: same expected sequence as above.
161 ; BTVER2: vextractf128
162 ; BTVER2-NEXT: vpaddq
163 ; BTVER2-NEXT: vpaddq
164 ; BTVER2-NEXT: vinsertf128
; core-avx2 run: no vextract instruction is expected.
167 ; HASWELL-NOT: vextract
; Address two adjacent <2 x i64> elements (indices 5 and 6).
171 %ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
172 %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
; Both 16-byte loads declare only 1-byte alignment.
173 %v1 = load <2 x i64>* %ptr1, align 1
174 %v2 = load <2 x i64>* %ptr2, align 1
; Identity mask 0..3 over (%v1, %v2): plain concatenation of the two loads.
175 %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Integer add consumes the combined vector so the element type matters.
176 %v4 = add <4 x i64> %v3, %x
180 define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
181 ; CHECK-LABEL: combine_16_byte_loads_i32
; Sandy Bridge / Ivy Bridge runs: expect vextractf128, two vpaddd and a
; vinsertf128 on consecutive output lines.
183 ; SANDYB: vextractf128
184 ; SANDYB-NEXT: vpaddd
185 ; SANDYB-NEXT: vpaddd
186 ; SANDYB-NEXT: vinsertf128
; btver2 run: same expected sequence as above.
189 ; BTVER2: vextractf128
190 ; BTVER2-NEXT: vpaddd
191 ; BTVER2-NEXT: vpaddd
192 ; BTVER2-NEXT: vinsertf128
; core-avx2 run: no vextract instruction is expected.
195 ; HASWELL-NOT: vextract
; Address two adjacent <4 x i32> elements (indices 6 and 7).
199 %ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
200 %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
; Both 16-byte loads declare only 1-byte alignment.
201 %v1 = load <4 x i32>* %ptr1, align 1
202 %v2 = load <4 x i32>* %ptr2, align 1
; Identity mask 0..7 over (%v1, %v2): plain concatenation of the two loads.
203 %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Integer add consumes the combined vector so the element type matters.
204 %v4 = add <8 x i32> %v3, %x
208 define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
209 ; CHECK-LABEL: combine_16_byte_loads_i16
; Sandy Bridge / Ivy Bridge runs: expect vextractf128, two vpaddw and a
; vinsertf128 on consecutive output lines.
211 ; SANDYB: vextractf128
212 ; SANDYB-NEXT: vpaddw
213 ; SANDYB-NEXT: vpaddw
214 ; SANDYB-NEXT: vinsertf128
; btver2 run: same expected sequence as above.
217 ; BTVER2: vextractf128
218 ; BTVER2-NEXT: vpaddw
219 ; BTVER2-NEXT: vpaddw
220 ; BTVER2-NEXT: vinsertf128
; core-avx2 run: no vextract instruction is expected.
223 ; HASWELL-NOT: vextract
; Address two adjacent <8 x i16> elements (indices 7 and 8).
227 %ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
228 %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
; Both 16-byte loads declare only 1-byte alignment.
229 %v1 = load <8 x i16>* %ptr1, align 1
230 %v2 = load <8 x i16>* %ptr2, align 1
; Identity mask 0..15 over (%v1, %v2): plain concatenation of the two loads.
231 %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Integer add consumes the combined vector so the element type matters.
232 %v4 = add <16 x i16> %v3, %x
236 define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
237 ; CHECK-LABEL: combine_16_byte_loads_i8
; Sandy Bridge / Ivy Bridge runs: expect vextractf128, two vpaddb and a
; vinsertf128 on consecutive output lines.
239 ; SANDYB: vextractf128
240 ; SANDYB-NEXT: vpaddb
241 ; SANDYB-NEXT: vpaddb
242 ; SANDYB-NEXT: vinsertf128
; btver2 run: same expected sequence as above.
245 ; BTVER2: vextractf128
246 ; BTVER2-NEXT: vpaddb
247 ; BTVER2-NEXT: vpaddb
248 ; BTVER2-NEXT: vinsertf128
; core-avx2 run: no vextract instruction is expected.
251 ; HASWELL-NOT: vextract
; Address two adjacent <16 x i8> elements (indices 8 and 9).
255 %ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
256 %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
; Both 16-byte loads declare only 1-byte alignment.
257 %v1 = load <16 x i8>* %ptr1, align 1
258 %v2 = load <16 x i8>* %ptr2, align 1
; Identity mask 0..31 over (%v1, %v2): plain concatenation of the two loads.
259 %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; Integer add consumes the combined vector so the element type matters.
260 %v4 = add <32 x i8> %v3, %x
264 define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
265 ; CHECK-LABEL: combine_16_byte_loads_double
; Sandy Bridge / Ivy Bridge runs: expect a vinsertf128 immediately followed
; by a vaddpd.
268 ; SANDYB-NEXT: vinsertf128
269 ; SANDYB-NEXT: vaddpd
; btver2 and core-avx2 runs: no vinsertf128 is expected.
272 ; BTVER2-NOT: vinsertf128
276 ; HASWELL-NOT: vinsertf128
; Address two adjacent <2 x double> elements (indices 9 and 10).
280 %ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
281 %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
; Both 16-byte loads declare only 1-byte alignment.
282 %v1 = load <2 x double>* %ptr1, align 1
283 %v2 = load <2 x double>* %ptr2, align 1
; Identity mask 0..3 over (%v1, %v2): plain concatenation of the two loads.
284 %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Floating-point add consumes the combined vector.
285 %v4 = fadd <4 x double> %v3, %x