1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
6 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
7 ; because that is slower than two 16-byte loads.
8 ; Other AVX-capable chips don't have that problem.
; Loads one <8 x float> value (32 bytes) through %Ap while declaring only
; 16-byte alignment on the load.
10 define <8 x float> @load32bytes(<8 x float>* %Ap) {
11 ; CHECK-LABEL: load32bytes
; align 16 is smaller than the 32-byte size of the loaded vector.
23 %A = load <8 x float>, <8 x float>* %Ap, align 16
27 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
28 ; because that is slower than two 16-byte stores.
29 ; Other AVX-capable chips don't have that problem.
; Stores the <8 x float> argument %A (32 bytes) through %P while declaring
; only 16-byte alignment on the store.
31 define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
32 ; CHECK-LABEL: store32bytes
; Under the SANDYB prefix (used by the corei7-avx and core-avx-i run lines)
; the output is expected to contain a vextractf128.
34 ; SANDYB: vextractf128
; align 16 is smaller than the 32-byte size of the stored vector.
44 store <8 x float> %A, <8 x float>* %P, align 16
48 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
; Builds an <8 x float> from two <4 x float> loads at adjacent element
; indices of %ptr, concatenated with a plain shufflevector.
51 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
52 ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
; Under the SANDYB prefix a vinsertf128 is expected on the output line right
; after the previous SANDYB match (that earlier directive is not visible here).
55 ; SANDYB-NEXT: vinsertf128
; Indices 3 and 4 of a <4 x float> pointer address two back-to-back 16-byte
; regions.
64 %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
65 %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
; Both halves are loaded with align 1, i.e. no alignment guarantee.
66 %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
67 %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
; Identity mask 0..7 yields the lanes of %v1 followed by the lanes of %v2.
68 %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
72 ; Swap the order of the shufflevector operands to ensure that the
73 ; pattern still matches.
; Same concatenation of two adjacent <4 x float> loads, but the shufflevector
; operands are given in swapped order and the mask compensates for it.
74 define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
75 ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
; Under the SANDYB prefix a vinsertf128 is expected on the output line right
; after the previous SANDYB match (that earlier directive is not visible here).
78 ; SANDYB-NEXT: vinsertf128
; Indices 4 and 5 of a <4 x float> pointer address two back-to-back 16-byte
; regions.
87 %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
88 %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
; Both halves are loaded with align 1, i.e. no alignment guarantee.
89 %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
90 %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
; Operands are (%v2, %v1); mask lanes 4..7 pick the second operand (%v1) and
; lanes 0..3 pick the first (%v2), so the result is still %v1 then %v2.
91 %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
95 ; Check each element type other than float to make sure it is handled correctly.
96 ; Use the loaded values with an 'add' to make sure we're using the correct load type.
97 ; Even though BtVer2 has fast 32-byte loads, we should not generate those for
98 ; 256-bit integer vectors because BtVer2 doesn't have AVX2.
; i64 variant. Two adjacent <2 x i64> loads from %ptr are concatenated into a
; <4 x i64> and added to the argument %x.
100 define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
101 ; CHECK-LABEL: combine_16_byte_loads_i64
; Sandy Bridge / Ivy Bridge run lines expect a vextractf128, then two vpaddq
; and a vinsertf128 on consecutive output lines.
103 ; SANDYB: vextractf128
104 ; SANDYB-NEXT: vpaddq
105 ; SANDYB-NEXT: vpaddq
106 ; SANDYB-NEXT: vinsertf128
; The btver2 run line expects the same four-instruction sequence.
109 ; BTVER2: vextractf128
110 ; BTVER2-NEXT: vpaddq
111 ; BTVER2-NEXT: vpaddq
112 ; BTVER2-NEXT: vinsertf128
; The core-avx2 run line forbids a "vextract" match at this point; the
; directives that bound this negative check are not visible here.
115 ; HASWELL-NOT: vextract
; Indices 5 and 6 of a <2 x i64> pointer address two back-to-back 16-byte
; regions.
119 %ptr1 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 5
120 %ptr2 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 6
; Both halves are loaded with align 1, i.e. no alignment guarantee.
121 %v1 = load <2 x i64>, <2 x i64>* %ptr1, align 1
122 %v2 = load <2 x i64>, <2 x i64>* %ptr2, align 1
; Identity mask 0..3 yields the lanes of %v1 followed by the lanes of %v2.
123 %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Integer add consumes the concatenated value as <4 x i64>.
124 %v4 = add <4 x i64> %v3, %x
; i32 variant. Two adjacent <4 x i32> loads from %ptr are concatenated into an
; <8 x i32> and added to the argument %x.
128 define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
129 ; CHECK-LABEL: combine_16_byte_loads_i32
; Sandy Bridge / Ivy Bridge run lines expect a vextractf128, then two vpaddd
; and a vinsertf128 on consecutive output lines.
131 ; SANDYB: vextractf128
132 ; SANDYB-NEXT: vpaddd
133 ; SANDYB-NEXT: vpaddd
134 ; SANDYB-NEXT: vinsertf128
; The btver2 run line expects the same four-instruction sequence.
137 ; BTVER2: vextractf128
138 ; BTVER2-NEXT: vpaddd
139 ; BTVER2-NEXT: vpaddd
140 ; BTVER2-NEXT: vinsertf128
; The core-avx2 run line forbids a "vextract" match at this point; the
; directives that bound this negative check are not visible here.
143 ; HASWELL-NOT: vextract
; Indices 6 and 7 of a <4 x i32> pointer address two back-to-back 16-byte
; regions.
147 %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 6
148 %ptr2 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 7
; Both halves are loaded with align 1, i.e. no alignment guarantee.
149 %v1 = load <4 x i32>, <4 x i32>* %ptr1, align 1
150 %v2 = load <4 x i32>, <4 x i32>* %ptr2, align 1
; Identity mask 0..7 yields the lanes of %v1 followed by the lanes of %v2.
151 %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Integer add consumes the concatenated value as <8 x i32>.
152 %v4 = add <8 x i32> %v3, %x
; i16 variant. Two adjacent <8 x i16> loads from %ptr are concatenated into a
; <16 x i16> and added to the argument %x.
156 define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
157 ; CHECK-LABEL: combine_16_byte_loads_i16
; Sandy Bridge / Ivy Bridge run lines expect a vextractf128, then two vpaddw
; and a vinsertf128 on consecutive output lines.
159 ; SANDYB: vextractf128
160 ; SANDYB-NEXT: vpaddw
161 ; SANDYB-NEXT: vpaddw
162 ; SANDYB-NEXT: vinsertf128
; The btver2 run line expects the same four-instruction sequence.
165 ; BTVER2: vextractf128
166 ; BTVER2-NEXT: vpaddw
167 ; BTVER2-NEXT: vpaddw
168 ; BTVER2-NEXT: vinsertf128
; The core-avx2 run line forbids a "vextract" match at this point; the
; directives that bound this negative check are not visible here.
171 ; HASWELL-NOT: vextract
; Indices 7 and 8 of an <8 x i16> pointer address two back-to-back 16-byte
; regions.
175 %ptr1 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 7
176 %ptr2 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 8
; Both halves are loaded with align 1, i.e. no alignment guarantee.
177 %v1 = load <8 x i16>, <8 x i16>* %ptr1, align 1
178 %v2 = load <8 x i16>, <8 x i16>* %ptr2, align 1
; Identity mask 0..15 yields the lanes of %v1 followed by the lanes of %v2.
179 %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Integer add consumes the concatenated value as <16 x i16>.
180 %v4 = add <16 x i16> %v3, %x
; i8 variant. Two adjacent <16 x i8> loads from %ptr are concatenated into a
; <32 x i8> and added to the argument %x.
184 define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
185 ; CHECK-LABEL: combine_16_byte_loads_i8
; Sandy Bridge / Ivy Bridge run lines expect a vextractf128, then two vpaddb
; and a vinsertf128 on consecutive output lines.
187 ; SANDYB: vextractf128
188 ; SANDYB-NEXT: vpaddb
189 ; SANDYB-NEXT: vpaddb
190 ; SANDYB-NEXT: vinsertf128
; The btver2 run line expects the same four-instruction sequence.
193 ; BTVER2: vextractf128
194 ; BTVER2-NEXT: vpaddb
195 ; BTVER2-NEXT: vpaddb
196 ; BTVER2-NEXT: vinsertf128
; The core-avx2 run line forbids a "vextract" match at this point; the
; directives that bound this negative check are not visible here.
199 ; HASWELL-NOT: vextract
; Indices 8 and 9 of a <16 x i8> pointer address two back-to-back 16-byte
; regions.
203 %ptr1 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 8
204 %ptr2 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 9
; Both halves are loaded with align 1, i.e. no alignment guarantee.
205 %v1 = load <16 x i8>, <16 x i8>* %ptr1, align 1
206 %v2 = load <16 x i8>, <16 x i8>* %ptr2, align 1
; Identity mask 0..31 yields the lanes of %v1 followed by the lanes of %v2.
207 %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; Integer add consumes the concatenated value as <32 x i8>.
208 %v4 = add <32 x i8> %v3, %x
; double variant. Two adjacent <2 x double> loads from %ptr are concatenated
; into a <4 x double> and combined with the argument %x using fadd.
212 define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
213 ; CHECK-LABEL: combine_16_byte_loads_double
; Sandy Bridge / Ivy Bridge run lines expect a vinsertf128 immediately followed
; by a vaddpd; the directive these follow is not visible here.
216 ; SANDYB-NEXT: vinsertf128
217 ; SANDYB-NEXT: vaddpd
; The btver2 and core-avx2 run lines each forbid a vinsertf128 match at this
; point; the directives bounding these negative checks are not visible here.
220 ; BTVER2-NOT: vinsertf128
224 ; HASWELL-NOT: vinsertf128
; Indices 9 and 10 of a <2 x double> pointer address two back-to-back 16-byte
; regions.
228 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 9
229 %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 10
; Both halves are loaded with align 1, i.e. no alignment guarantee.
230 %v1 = load <2 x double>, <2 x double>* %ptr1, align 1
231 %v2 = load <2 x double>, <2 x double>* %ptr2, align 1
; Identity mask 0..3 yields the lanes of %v1 followed by the lanes of %v2.
232 %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Floating-point add consumes the concatenated value as <4 x double>.
233 %v4 = fadd <4 x double> %v3, %x