1 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
2 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
3 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
4 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
7 ; Verify the cost of vector shift left instructions.
9 ; We always emit a single pmullw in the case of v8i16 vector shifts by
10 ; a non-uniform constant.
; test1: shl of an <8 x i16> value by a constant vector whose lanes differ
; (1, 1, 2, 3, 7, 0, 9, 11). The expectation below uses the common prefix
; shared by every RUN line, so a cost of 1 is expected in all configurations.
12 define <8 x i16> @test1(<8 x i16> %a) {
13 %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
16 ; CHECK: 'Cost Model Analysis' for function 'test1':
17 ; CHECK: Found an estimated cost of 1 for instruction: %shl
; test2: shl of an <8 x i16> value by a constant vector that mixes 0 and 1
; lanes with two undef lanes and one -1 lane. As in test1, the expectation
; uses the common prefix, so a cost of 1 is expected in all configurations.
20 define <8 x i16> @test2(<8 x i16> %a) {
21 %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
24 ; CHECK: 'Cost Model Analysis' for function 'test2':
25 ; CHECK: Found an estimated cost of 1 for instruction: %shl
28 ; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
29 ; Make sure that the estimated cost is always 1 except for the case where
30 ; we only have SSE2 support. With SSE2, we are forced to custom lower the
31 ; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle.
; test3: shl of a <4 x i32> value by a constant vector with four distinct
; lanes (1, -1, 2, -3). Expected cost is 6 under the SSE2 prefix and 1 under
; the SSE41, AVX and AVX2 prefixes.
33 define <4 x i32> @test3(<4 x i32> %a) {
34 %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
37 ; CHECK: 'Cost Model Analysis' for function 'test3':
38 ; SSE2: Found an estimated cost of 6 for instruction: %shl
39 ; SSE41: Found an estimated cost of 1 for instruction: %shl
40 ; AVX: Found an estimated cost of 1 for instruction: %shl
41 ; AVX2: Found an estimated cost of 1 for instruction: %shl
; test4: shl of a <4 x i32> value by the constant vector (0, 0, 1, 1), which
; is non-uniform but only uses two distinct shift amounts. The expected costs
; are the same as for test3: 6 under the SSE2 prefix, 1 under the others.
44 define <4 x i32> @test4(<4 x i32> %a) {
45 %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
48 ; CHECK: 'Cost Model Analysis' for function 'test4':
49 ; SSE2: Found an estimated cost of 6 for instruction: %shl
50 ; SSE41: Found an estimated cost of 1 for instruction: %shl
51 ; AVX: Found an estimated cost of 1 for instruction: %shl
52 ; AVX2: Found an estimated cost of 1 for instruction: %shl
55 ; On AVX2 we are able to lower the following shift into a single
56 ; vpsllvq. Therefore, the expected cost is only 1.
57 ; In all other cases, this shift is scalarized as the target does not support
58 ; vpsllv instructions.
; test5: shl of a <2 x i64> value by the constant vector (2, 3). Expected cost
; is 4 under the SSE2, SSE41 and AVX prefixes and 1 under the AVX2 prefix.
60 define <2 x i64> @test5(<2 x i64> %a) {
61 %shl = shl <2 x i64> %a, <i64 2, i64 3>
64 ; CHECK: 'Cost Model Analysis' for function 'test5':
65 ; SSE2: Found an estimated cost of 4 for instruction: %shl
66 ; SSE41: Found an estimated cost of 4 for instruction: %shl
67 ; AVX: Found an estimated cost of 4 for instruction: %shl
68 ; AVX2: Found an estimated cost of 1 for instruction: %shl
71 ; v16i16 and v8i32 shifts left by a non-uniform constant are lowered into
72 ; vector multiply instructions. With AVX (but not AVX2), the vector multiply
73 ; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert.
75 ; With AVX2, instruction vpmullw works with 256bit quantities and
76 ; therefore there is no need to split the resulting vector multiply into
77 ; a sequence of two multiplies.
79 ; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice
80 ; the cost computed in the case of 'test1'. That is because the backend
81 ; simply emits 2 pmullw with no extract/insert.
; test6: shl of a <16 x i16> value. The 16 constant shift amounts are the
; 8-lane pattern (1, 1, 2, 3, 7, 0, 9, 11) repeated twice. Expected cost is 2
; under the SSE2 and SSE41 prefixes, 4 under AVX and 1 under AVX2.
84 define <16 x i16> @test6(<16 x i16> %a) {
85 %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
88 ; CHECK: 'Cost Model Analysis' for function 'test6':
89 ; SSE2: Found an estimated cost of 2 for instruction: %shl
90 ; SSE41: Found an estimated cost of 2 for instruction: %shl
91 ; AVX: Found an estimated cost of 4 for instruction: %shl
92 ; AVX2: Found an estimated cost of 1 for instruction: %shl
95 ; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice
96 ; the cost computed in the case of 'test3'. That is because the multiply
97 ; is type-legalized into two v4i32 vector multiplies.
; test7: shl of an <8 x i32> value. The constant shift amounts are the 4-lane
; pattern (1, 1, 2, 3) repeated twice. Expected cost is 12 under the SSE2
; prefix, 2 under SSE41, 4 under AVX and 1 under AVX2.
99 define <8 x i32> @test7(<8 x i32> %a) {
100 %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
103 ; CHECK: 'Cost Model Analysis' for function 'test7':
104 ; SSE2: Found an estimated cost of 12 for instruction: %shl
105 ; SSE41: Found an estimated cost of 2 for instruction: %shl
106 ; AVX: Found an estimated cost of 4 for instruction: %shl
107 ; AVX2: Found an estimated cost of 1 for instruction: %shl
110 ; On AVX2 we are able to lower the following shift into a single
111 ; vpsllvq. Therefore, the expected cost is only 1.
112 ; In all other cases, this shift is scalarized as the target does not support
113 ; vpsllv instructions.
; test8: shl of a <4 x i64> value by the constant vector (1, 2, 3, 4).
; Expected cost is 8 under the SSE2, SSE41 and AVX prefixes and 1 under the
; AVX2 prefix.
115 define <4 x i64> @test8(<4 x i64> %a) {
116 %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
119 ; CHECK: 'Cost Model Analysis' for function 'test8':
120 ; SSE2: Found an estimated cost of 8 for instruction: %shl
121 ; SSE41: Found an estimated cost of 8 for instruction: %shl
122 ; AVX: Found an estimated cost of 8 for instruction: %shl
123 ; AVX2: Found an estimated cost of 1 for instruction: %shl
126 ; Same as 'test6', with the difference that the cost is double.
; test9: shl of a <32 x i16> value. The constant shift amounts are the 8-lane
; pattern (1, 1, 2, 3, 7, 0, 9, 11) repeated four times. Each expected cost is
; twice the corresponding test6 value: 4 under the SSE2 and SSE41 prefixes,
; 8 under AVX and 2 under AVX2.
128 define <32 x i16> @test9(<32 x i16> %a) {
129 %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
132 ; CHECK: 'Cost Model Analysis' for function 'test9':
133 ; SSE2: Found an estimated cost of 4 for instruction: %shl
134 ; SSE41: Found an estimated cost of 4 for instruction: %shl
135 ; AVX: Found an estimated cost of 8 for instruction: %shl
136 ; AVX2: Found an estimated cost of 2 for instruction: %shl
139 ; Same as 'test7', except that now the cost is double.
; test10: shl of a <16 x i32> value. The constant shift amounts are the 4-lane
; pattern (1, 1, 2, 3) repeated four times. Each expected cost is twice the
; corresponding test7 value: 24 under the SSE2 prefix, 4 under SSE41, 8 under
; AVX and 2 under AVX2.
141 define <16 x i32> @test10(<16 x i32> %a) {
142 %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
145 ; CHECK: 'Cost Model Analysis' for function 'test10':
146 ; SSE2: Found an estimated cost of 24 for instruction: %shl
147 ; SSE41: Found an estimated cost of 4 for instruction: %shl
148 ; AVX: Found an estimated cost of 8 for instruction: %shl
149 ; AVX2: Found an estimated cost of 2 for instruction: %shl
152 ; On AVX2 we are able to lower the following shift into a sequence of
153 ; two vpsllvq instructions. Therefore, the expected cost is only 2.
154 ; In all other cases, this shift is scalarized as we don't have vpsllv instructions.
; test11: shl of an <8 x i64> value. The constant shift amounts are the 4-lane
; pattern (1, 1, 2, 3) repeated twice. Expected cost is 16 under the SSE2,
; SSE41 and AVX prefixes and 2 under the AVX2 prefix.
157 define <8 x i64> @test11(<8 x i64> %a) {
158 %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
161 ; CHECK: 'Cost Model Analysis' for function 'test11':
162 ; SSE2: Found an estimated cost of 16 for instruction: %shl
163 ; SSE41: Found an estimated cost of 16 for instruction: %shl
164 ; AVX: Found an estimated cost of 16 for instruction: %shl
165 ; AVX2: Found an estimated cost of 2 for instruction: %shl