test/Analysis/CostModel/X86/vshift-cost.ll

   1 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
   2 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
   3 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
   4 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
   5 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX
   6 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2
   7
   8
   9 ; Verify the cost of vector shift left instructions.
  10
  11 ; We always emit a single pmullw in the case of v8i16 vector shifts by
  12 ; non-uniform constant.
  13
  14 define <8 x i16> @test1(<8 x i16> %a) {  ; shl of <8 x i16> by a non-uniform constant vector
  15   %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>  ; the name %shl is matched by the cost checks after this function
  16   ret <8 x i16> %shl
  17 }
  18 ; CHECK: 'Cost Model Analysis' for function 'test1':
  19 ; CHECK: Found an estimated cost of 1 for instruction:   %shl
  20
  21
  22 define <8 x i16> @test2(<8 x i16> %a) {  ; shl of <8 x i16> whose constant amounts include undef lanes and a -1 lane
  23   %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>  ; the name %shl is matched by the cost checks after this function
  24   ret <8 x i16> %shl
  25 }
  26 ; CHECK: 'Cost Model Analysis' for function 'test2':
  27 ; CHECK: Found an estimated cost of 1 for instruction:   %shl
  28
  29
  30 ; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
  31 ; Make sure that the estimated cost is always 1 except for the case where
  32 ; we only have SSE2 support. With SSE2, we are forced to custom-lower the
  33 ; v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle (hence the cost of 6).
  34
  35 define <4 x i32> @test3(<4 x i32> %a) {  ; shl of <4 x i32> by a non-uniform constant vector that includes negative lanes (-1, -3)
  36   %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>  ; the name %shl is matched by the cost checks after this function
  37   ret <4 x i32> %shl
  38 }
  39 ; CHECK: 'Cost Model Analysis' for function 'test3':
  40 ; SSE2: Found an estimated cost of 6 for instruction:   %shl
  41 ; SSE41: Found an estimated cost of 1 for instruction:   %shl
  42 ; AVX: Found an estimated cost of 1 for instruction:   %shl
  43 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
  44 ; XOP: Found an estimated cost of 1 for instruction:   %shl
  45
  46
  47 define <4 x i32> @test4(<4 x i32> %a) {  ; shl of <4 x i32> by the non-uniform constant vector <0, 0, 1, 1>
  48   %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>  ; the name %shl is matched by the cost checks after this function
  49   ret <4 x i32> %shl
  50 }
  51 ; CHECK: 'Cost Model Analysis' for function 'test4':
  52 ; SSE2: Found an estimated cost of 6 for instruction:   %shl
  53 ; SSE41: Found an estimated cost of 1 for instruction:   %shl
  54 ; AVX: Found an estimated cost of 1 for instruction:   %shl
  55 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
  56 ; XOP: Found an estimated cost of 1 for instruction:   %shl
  57
  58
  59 ; On AVX2 we are able to lower the following shift into a single
  60 ; vpsllvq. Therefore, the expected cost is only 1 (XOP also expects 1).
  61 ; On SSE2, SSE4.1 and AVX the shift is scalarized as the target does not
  62 ; support vpsllv instructions.
  63
  64 define <2 x i64> @test5(<2 x i64> %a) {  ; shl of <2 x i64> by two different constant amounts (2 and 3)
  65   %shl = shl <2 x i64> %a, <i64 2, i64 3>  ; the name %shl is matched by the cost checks after this function
  66   ret <2 x i64> %shl
  67 }
  68 ; CHECK: 'Cost Model Analysis' for function 'test5':
  69 ; SSE2: Found an estimated cost of 4 for instruction:   %shl
  70 ; SSE41: Found an estimated cost of 4 for instruction:   %shl
  71 ; AVX: Found an estimated cost of 4 for instruction:   %shl
  72 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
  73 ; XOP: Found an estimated cost of 1 for instruction:   %shl
  74
  75
  76 ; v16i16 and v8i32 shift left by non-uniform constant are lowered into
  77 ; vector multiply instructions.  With AVX (but not AVX2), the vector multiply
  78 ; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert.
  79 ;
  80 ; With AVX2, the vpmullw instruction works with 256-bit quantities and
  81 ; therefore there is no need to split the resulting vector multiply into
  82 ; a sequence of two multiplies.
  83 ;
  84 ; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice
  85 ; the cost computed in the case of 'test1'. That is because the backend
  86 ; simply emits 2 pmullw with no extract/insert.
  87
  88
  89 define <16 x i16> @test6(<16 x i16> %a) {  ; shl of <16 x i16>; the amount vector is the 8-lane pattern from 'test1' repeated twice
  90   %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>  ; the name %shl is matched by the cost checks after this function
  91   ret <16 x i16> %shl
  92 }
  93 ; CHECK: 'Cost Model Analysis' for function 'test6':
  94 ; SSE2: Found an estimated cost of 2 for instruction:   %shl
  95 ; SSE41: Found an estimated cost of 2 for instruction:   %shl
  96 ; AVX: Found an estimated cost of 4 for instruction:   %shl
  97 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
  98 ; XOPAVX: Found an estimated cost of 2 for instruction:   %shl
  99 ; XOPAVX2: Found an estimated cost of 1 for instruction:   %shl
 100
 101
 102 ; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice
 103 ; the cost computed in the case of 'test3'. That is because the multiply
 104 ; is type-legalized into two v4i32 vector multiplies.
 105
 106 define <8 x i32> @test7(<8 x i32> %a) {  ; shl of <8 x i32>; the amount vector is the pattern <1, 1, 2, 3> repeated twice
 107   %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>  ; the name %shl is matched by the cost checks after this function
 108   ret <8 x i32> %shl
 109 }
 110 ; CHECK: 'Cost Model Analysis' for function 'test7':
 111 ; SSE2: Found an estimated cost of 12 for instruction:   %shl
 112 ; SSE41: Found an estimated cost of 2 for instruction:   %shl
 113 ; AVX: Found an estimated cost of 4 for instruction:   %shl
 114 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
 115 ; XOPAVX: Found an estimated cost of 2 for instruction:   %shl
 116 ; XOPAVX2: Found an estimated cost of 1 for instruction:   %shl
 117
 118
 119 ; On AVX2 we are able to lower the following shift into a single
 120 ; vpsllvq. Therefore, the expected cost is only 1 (2 for XOP without AVX2).
 121 ; On SSE2, SSE4.1 and AVX the shift is scalarized as the target does not
 122 ; support vpsllv instructions.
 123
 124 define <4 x i64> @test8(<4 x i64> %a) {  ; shl of <4 x i64> by the non-uniform constant vector <1, 2, 3, 4>
 125   %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>  ; the name %shl is matched by the cost checks after this function
 126   ret <4 x i64> %shl
 127 }
 128 ; CHECK: 'Cost Model Analysis' for function 'test8':
 129 ; SSE2: Found an estimated cost of 8 for instruction:   %shl
 130 ; SSE41: Found an estimated cost of 8 for instruction:   %shl
 131 ; AVX: Found an estimated cost of 8 for instruction:   %shl
 132 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
 133 ; XOPAVX: Found an estimated cost of 2 for instruction:   %shl
 134 ; XOPAVX2: Found an estimated cost of 1 for instruction:   %shl
 135
 136
 137 ; Same as 'test6', with the difference that the cost is double.
 138
 139 define <32 x i16> @test9(<32 x i16> %a) {  ; shl of <32 x i16>; the amount vector is the 8-lane pattern from 'test1' repeated four times
 140   %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>  ; the name %shl is matched by the cost checks after this function
 141   ret <32 x i16> %shl
 142 }
 143 ; CHECK: 'Cost Model Analysis' for function 'test9':
 144 ; SSE2: Found an estimated cost of 4 for instruction:   %shl
 145 ; SSE41: Found an estimated cost of 4 for instruction:   %shl
 146 ; AVX: Found an estimated cost of 8 for instruction:   %shl
 147 ; AVX2: Found an estimated cost of 2 for instruction:   %shl
 148 ; XOPAVX: Found an estimated cost of 4 for instruction:   %shl
 149 ; XOPAVX2: Found an estimated cost of 2 for instruction:   %shl
 150
 151
 152 ; Same as 'test7', except that now the cost is double.
 153
 154 define <16 x i32> @test10(<16 x i32> %a) {  ; shl of <16 x i32>; the amount vector is the pattern <1, 1, 2, 3> repeated four times
 155   %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>  ; the name %shl is matched by the cost checks after this function
 156   ret <16 x i32> %shl
 157 }
 158 ; CHECK: 'Cost Model Analysis' for function 'test10':
 159 ; SSE2: Found an estimated cost of 24 for instruction:   %shl
 160 ; SSE41: Found an estimated cost of 4 for instruction:   %shl
 161 ; AVX: Found an estimated cost of 8 for instruction:   %shl
 162 ; AVX2: Found an estimated cost of 2 for instruction:   %shl
 163 ; XOPAVX: Found an estimated cost of 4 for instruction:   %shl
 164 ; XOPAVX2: Found an estimated cost of 2 for instruction:   %shl
 165
 166
 167 ; On AVX2 we are able to lower the following shift into a sequence of
 168 ; two vpsllvq instructions. Therefore, the expected cost is only 2 (4 for
 169 ; XOP without AVX2). On SSE2, SSE4.1 and AVX the shift is scalarized as
 170 ; we don't have vpsllv instructions.
 171
 172 define <8 x i64> @test11(<8 x i64> %a) {  ; shl of <8 x i64>; the amount vector is the pattern <1, 1, 2, 3> repeated twice
 173   %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>  ; the name %shl is matched by the cost checks after this function
 174   ret <8 x i64> %shl
 175 }
 176 ; CHECK: 'Cost Model Analysis' for function 'test11':
 177 ; SSE2: Found an estimated cost of 16 for instruction:   %shl
 178 ; SSE41: Found an estimated cost of 16 for instruction:   %shl
 179 ; AVX: Found an estimated cost of 16 for instruction:   %shl
 180 ; AVX2: Found an estimated cost of 2 for instruction:   %shl
 181 ; XOPAVX: Found an estimated cost of 4 for instruction:   %shl
 182 ; XOPAVX2: Found an estimated cost of 2 for instruction:   %shl