1 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
2 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
3 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
4 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
5 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX
6 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2
9 ; Verify the cost of vector shift left instructions.
11 ; We always emit a single pmullw in the case of v8i16 vector shifts by a
12 ; non-uniform constant.
; test1: shl of <8 x i16> by a constant vector whose lanes hold different
; shift amounts. A cost of 1 is expected for every configuration under test.
14 define <8 x i16> @test1(<8 x i16> %a) {
15 %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
18 ; CHECK: 'Cost Model Analysis' for function 'test1':
19 ; CHECK: Found an estimated cost of 1 for instruction: %shl
; test2: same vector type as test1, but the shift-amount vector contains undef
; lanes and an i16 -1 lane. A cost of 1 is still expected for every
; configuration under test.
; NOTE(review): i16 -1 is not smaller than the 16-bit element width, so that
; lane looks like an out-of-range shift amount - confirm this is intentional.
22 define <8 x i16> @test2(<8 x i16> %a) {
23 %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
26 ; CHECK: 'Cost Model Analysis' for function 'test2':
27 ; CHECK: Found an estimated cost of 1 for instruction: %shl
30 ; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
31 ; Make sure that the estimated cost is always 1, except when only SSE2 is
32 ; available. With SSE2, the v4i32 mul has to be custom lowered into a
33 ; sequence of 2x shuffle, 2x pmuludq, 2x shuffle (hence the cost of 6).
; test3: shl of <4 x i32> by a non-uniform constant vector (note that two of
; the lanes, i32 -1 and i32 -3, are negative). The expected cost is 6 for the
; SSE2-only configuration and 1 for all the other ones.
35 define <4 x i32> @test3(<4 x i32> %a) {
36 %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
39 ; CHECK: 'Cost Model Analysis' for function 'test3':
40 ; SSE2: Found an estimated cost of 6 for instruction: %shl
41 ; SSE41: Found an estimated cost of 1 for instruction: %shl
42 ; AVX: Found an estimated cost of 1 for instruction: %shl
43 ; AVX2: Found an estimated cost of 1 for instruction: %shl
44 ; XOP: Found an estimated cost of 1 for instruction: %shl
; test4: shl of <4 x i32> by the constant vector <0, 0, 1, 1>. The expected
; costs match test3, i.e. 6 for the SSE2-only configuration and 1 otherwise.
47 define <4 x i32> @test4(<4 x i32> %a) {
48 %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
51 ; CHECK: 'Cost Model Analysis' for function 'test4':
52 ; SSE2: Found an estimated cost of 6 for instruction: %shl
53 ; SSE41: Found an estimated cost of 1 for instruction: %shl
54 ; AVX: Found an estimated cost of 1 for instruction: %shl
55 ; AVX2: Found an estimated cost of 1 for instruction: %shl
56 ; XOP: Found an estimated cost of 1 for instruction: %shl
59 ; On AVX2 we are able to lower the following shift into a single
60 ; vpsllvq. Therefore, the expected cost is only 1. XOP targets are also
61 ; expected to report a cost of 1. In all other cases, this shift is
62 ; scalarized as the target does not support vpsllv instructions.
; test5: shl of <2 x i64> by the constant vector <2, 3>. The expected cost is
; 4 for the SSE2, SSE4.1 and AVX configurations, and 1 for the AVX2 and XOP
; configurations.
64 define <2 x i64> @test5(<2 x i64> %a) {
65 %shl = shl <2 x i64> %a, <i64 2, i64 3>
68 ; CHECK: 'Cost Model Analysis' for function 'test5':
69 ; SSE2: Found an estimated cost of 4 for instruction: %shl
70 ; SSE41: Found an estimated cost of 4 for instruction: %shl
71 ; AVX: Found an estimated cost of 4 for instruction: %shl
72 ; AVX2: Found an estimated cost of 1 for instruction: %shl
73 ; XOP: Found an estimated cost of 1 for instruction: %shl
76 ; v16i16 and v8i32 shifts left by a non-uniform constant are lowered into
77 ; vector multiply instructions. With AVX (but not AVX2), the vector multiply
78 ; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert.
80 ; With AVX2, instruction vpmullw works with 256bit quantities and
81 ; therefore there is no need to split the resulting vector multiply into
82 ; a sequence of two multiplies.
84 ; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice
85 ; the cost computed in the case of 'test1'. That is because the backend
86 ; simply emits 2 pmullw with no extract/insert.
; test6: shl of <16 x i16> by a non-uniform constant vector (the shift amounts
; of test1, repeated twice). Expected costs are 2 for SSE2 and SSE4.1, 4 for
; AVX, 1 for AVX2, 2 for XOP on bdver2 and 1 for XOP on bdver4.
89 define <16 x i16> @test6(<16 x i16> %a) {
90 %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
93 ; CHECK: 'Cost Model Analysis' for function 'test6':
94 ; SSE2: Found an estimated cost of 2 for instruction: %shl
95 ; SSE41: Found an estimated cost of 2 for instruction: %shl
96 ; AVX: Found an estimated cost of 4 for instruction: %shl
97 ; AVX2: Found an estimated cost of 1 for instruction: %shl
98 ; XOPAVX: Found an estimated cost of 2 for instruction: %shl
99 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shl
102 ; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice
103 ; the cost computed in the case of 'test3'. That is because the multiply
104 ; is type-legalized into two v4i32 vector multiplies.
; test7: shl of <8 x i32> by the constant pattern <1, 1, 2, 3> repeated twice.
; Expected costs are 12 for SSE2, 2 for SSE4.1, 4 for AVX, 1 for AVX2, 2 for
; XOP on bdver2 and 1 for XOP on bdver4.
106 define <8 x i32> @test7(<8 x i32> %a) {
107 %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
110 ; CHECK: 'Cost Model Analysis' for function 'test7':
111 ; SSE2: Found an estimated cost of 12 for instruction: %shl
112 ; SSE41: Found an estimated cost of 2 for instruction: %shl
113 ; AVX: Found an estimated cost of 4 for instruction: %shl
114 ; AVX2: Found an estimated cost of 1 for instruction: %shl
115 ; XOPAVX: Found an estimated cost of 2 for instruction: %shl
116 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shl
119 ; On AVX2 we are able to lower the following shift into a single
120 ; vpsllvq. Therefore, the expected cost is only 1. The XOP runs expect a cost
121 ; of 2 (bdver2) and 1 (bdver4). In all other cases, this shift is scalarized
122 ; as the target does not support vpsllv instructions.
; test8: shl of <4 x i64> by the constant vector <1, 2, 3, 4>. Expected costs
; are 8 for SSE2, SSE4.1 and AVX, 1 for AVX2, 2 for XOP on bdver2 and 1 for
; XOP on bdver4.
124 define <4 x i64> @test8(<4 x i64> %a) {
125 %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
128 ; CHECK: 'Cost Model Analysis' for function 'test8':
129 ; SSE2: Found an estimated cost of 8 for instruction: %shl
130 ; SSE41: Found an estimated cost of 8 for instruction: %shl
131 ; AVX: Found an estimated cost of 8 for instruction: %shl
132 ; AVX2: Found an estimated cost of 1 for instruction: %shl
133 ; XOPAVX: Found an estimated cost of 2 for instruction: %shl
134 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shl
137 ; Same as 'test6', with the difference that the cost is double.
; test9: shl of <32 x i16> by a non-uniform constant vector (the shift amounts
; of test1, repeated four times). Every expected cost is twice the matching
; test6 value, i.e. 4 for SSE2 and SSE4.1, 8 for AVX, 2 for AVX2, 4 for XOP on
; bdver2 and 2 for XOP on bdver4.
139 define <32 x i16> @test9(<32 x i16> %a) {
140 %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
143 ; CHECK: 'Cost Model Analysis' for function 'test9':
144 ; SSE2: Found an estimated cost of 4 for instruction: %shl
145 ; SSE41: Found an estimated cost of 4 for instruction: %shl
146 ; AVX: Found an estimated cost of 8 for instruction: %shl
147 ; AVX2: Found an estimated cost of 2 for instruction: %shl
148 ; XOPAVX: Found an estimated cost of 4 for instruction: %shl
149 ; XOPAVX2: Found an estimated cost of 2 for instruction: %shl
152 ; Same as 'test7', except that now the cost is double.
; test10: shl of <16 x i32> by the constant pattern <1, 1, 2, 3> repeated four
; times. Every expected cost is twice the matching test7 value, i.e. 24 for
; SSE2, 4 for SSE4.1, 8 for AVX, 2 for AVX2, 4 for XOP on bdver2 and 2 for XOP
; on bdver4.
154 define <16 x i32> @test10(<16 x i32> %a) {
155 %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
158 ; CHECK: 'Cost Model Analysis' for function 'test10':
159 ; SSE2: Found an estimated cost of 24 for instruction: %shl
160 ; SSE41: Found an estimated cost of 4 for instruction: %shl
161 ; AVX: Found an estimated cost of 8 for instruction: %shl
162 ; AVX2: Found an estimated cost of 2 for instruction: %shl
163 ; XOPAVX: Found an estimated cost of 4 for instruction: %shl
164 ; XOPAVX2: Found an estimated cost of 2 for instruction: %shl
167 ; On AVX2 we are able to lower the following shift into a sequence of
168 ; two vpsllvq instructions. Therefore, the expected cost is only 2. The XOP runs expect 4 (bdver2) and 2 (bdver4).
169 ; In all other cases, this shift is scalarized as we don't have vpsllv instructions.
; test11: shl of <8 x i64> by the constant pattern <1, 1, 2, 3> repeated
; twice. Expected costs are 16 for SSE2, SSE4.1 and AVX, 2 for AVX2, 4 for XOP
; on bdver2 and 2 for XOP on bdver4.
172 define <8 x i64> @test11(<8 x i64> %a) {
173 %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
176 ; CHECK: 'Cost Model Analysis' for function 'test11':
177 ; SSE2: Found an estimated cost of 16 for instruction: %shl
178 ; SSE41: Found an estimated cost of 16 for instruction: %shl
179 ; AVX: Found an estimated cost of 16 for instruction: %shl
180 ; AVX2: Found an estimated cost of 2 for instruction: %shl
181 ; XOPAVX: Found an estimated cost of 4 for instruction: %shl
182 ; XOPAVX2: Found an estimated cost of 2 for instruction: %shl