test/Analysis/CostModel/X86/vshift-cost.ll

   1 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
   2 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
   3 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
   4 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
   5
   6
   7 ; Verify the cost of vector shift left instructions.
   8
   9 ; We always emit a single pmullw in the case of v8i16 vector shifts by a
  10 ; non-uniform constant.
  11
  12 define <8 x i16> @test1(<8 x i16> %a) {   ; v8i16 shift left by a constant vector
  13   %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>   ; shift amounts differ from lane to lane (non-uniform)
  14   ret <8 x i16> %shl
  15 }
  16 ; CHECK: 'Cost Model Analysis' for function 'test1':
  17 ; CHECK: Found an estimated cost of 1 for instruction:   %shl
  18
  19
  20 define <8 x i16> @test2(<8 x i16> %a) {   ; v8i16 shift left; amount vector contains undef lanes
  21   %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>   ; two undef lanes; NOTE(review) i16 -1 is out of range as a 16-bit shift amount -- presumably intentional, confirm
  22   ret <8 x i16> %shl
  23 }
  24 ; CHECK: 'Cost Model Analysis' for function 'test2':
  25 ; CHECK: Found an estimated cost of 1 for instruction:   %shl
  26
  27
  28 ; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
  29 ; Make sure that the estimated cost is always 1 except for the case where
  30 ; we only have SSE2 support. With SSE2, we are forced to custom-lower the
  31 ; v4i32 mul as a sequence of 2x shuffle, 2x pmuludq, 2x shuffle.
  32
  33 define <4 x i32> @test3(<4 x i32> %a) {   ; v4i32 shift left by a non-uniform constant vector
  34   %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>   ; NOTE(review) the negative amounts are out of range for a 32-bit shift -- presumably intentional, confirm
  35   ret <4 x i32> %shl
  36 }
  37 ; CHECK: 'Cost Model Analysis' for function 'test3':
  38 ; SSE2: Found an estimated cost of 6 for instruction:   %shl
  39 ; SSE41: Found an estimated cost of 1 for instruction:   %shl
  40 ; AVX: Found an estimated cost of 1 for instruction:   %shl
  41 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
  42
  43
  44 define <4 x i32> @test4(<4 x i32> %a) {   ; v4i32 shift left by a non-uniform constant vector
  45   %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>   ; only two distinct amounts (0 and 1), all in range
  46   ret <4 x i32> %shl
  47 }
  48 ; CHECK: 'Cost Model Analysis' for function 'test4':
  49 ; SSE2: Found an estimated cost of 6 for instruction:   %shl
  50 ; SSE41: Found an estimated cost of 1 for instruction:   %shl
  51 ; AVX: Found an estimated cost of 1 for instruction:   %shl
  52 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
  53
  54
  55 ; On AVX2 we are able to lower the following shift into a single
  56 ; vpsllvq. Therefore, the expected cost is only 1.
  57 ; In all other cases, this shift is scalarized as the target does not support
  58 ; vpsllv instructions.
  59
  60 define <2 x i64> @test5(<2 x i64> %a) {   ; v2i64 shift left by a non-uniform constant vector
  61   %shl = shl <2 x i64> %a, <i64 2, i64 3>   ; the two 64-bit lanes use different shift amounts
  62   ret <2 x i64> %shl
  63 }
  64 ; CHECK: 'Cost Model Analysis' for function 'test5':
  65 ; SSE2: Found an estimated cost of 4 for instruction:   %shl
  66 ; SSE41: Found an estimated cost of 4 for instruction:   %shl
  67 ; AVX: Found an estimated cost of 4 for instruction:   %shl
  68 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
  69
  70
  71 ; v16i16 and v8i32 shifts left by a non-uniform constant are lowered into
  72 ; vector multiply instructions.  With AVX (but not AVX2), the vector multiply
  73 ; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert.
  74 ;
  75 ; With AVX2, the vpmullw instruction works on 256-bit quantities, and
  76 ; therefore there is no need to split the resulting vector multiply into
  77 ; a sequence of two multiplies.
  78 ;
  79 ; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice
  80 ; the cost computed in the case of 'test1'. That is because the backend
  81 ; simply emits 2 pmullw with no extract/insert.
  82
  83
  84 define <16 x i16> @test6(<16 x i16> %a) {   ; v16i16 shift left by a non-uniform constant vector
  85   %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>   ; the 8-lane amount pattern of test1, repeated twice
  86   ret <16 x i16> %shl
  87 }
  88 ; CHECK: 'Cost Model Analysis' for function 'test6':
  89 ; SSE2: Found an estimated cost of 2 for instruction:   %shl
  90 ; SSE41: Found an estimated cost of 2 for instruction:   %shl
  91 ; AVX: Found an estimated cost of 4 for instruction:   %shl
  92 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
  93
  94
  95 ; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice
  96 ; the cost computed in the case of 'test3'. That is because the multiply
  97 ; is type-legalized into two v4i32 vector multiplies.
  98
  99 define <8 x i32> @test7(<8 x i32> %a) {   ; v8i32 shift left by a non-uniform constant vector
 100   %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>   ; the 4-lane amount pattern 1,1,2,3 repeated twice
 101   ret <8 x i32> %shl
 102 }
 103 ; CHECK: 'Cost Model Analysis' for function 'test7':
 104 ; SSE2: Found an estimated cost of 12 for instruction:   %shl
 105 ; SSE41: Found an estimated cost of 2 for instruction:   %shl
 106 ; AVX: Found an estimated cost of 4 for instruction:   %shl
 107 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
 108
 109
 110 ; On AVX2 we are able to lower the following shift into a single
 111 ; vpsllvq. Therefore, the expected cost is only 1.
 112 ; In all other cases, this shift is scalarized as the target does not support
 113 ; vpsllv instructions.
 114
 115 define <4 x i64> @test8(<4 x i64> %a) {   ; v4i64 shift left by a non-uniform constant vector
 116   %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>   ; each 64-bit lane uses a different shift amount
 117   ret <4 x i64> %shl
 118 }
 119 ; CHECK: 'Cost Model Analysis' for function 'test8':
 120 ; SSE2: Found an estimated cost of 8 for instruction:   %shl
 121 ; SSE41: Found an estimated cost of 8 for instruction:   %shl
 122 ; AVX: Found an estimated cost of 8 for instruction:   %shl
 123 ; AVX2: Found an estimated cost of 1 for instruction:   %shl
 124
 125
 126 ; Same as 'test6', with the difference that the cost is double.
 127
 128 define <32 x i16> @test9(<32 x i16> %a) {   ; v32i16 shift left by a non-uniform constant vector
 129   %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>   ; the 8-lane amount pattern of test1, repeated four times
 130   ret <32 x i16> %shl
 131 }
 132 ; CHECK: 'Cost Model Analysis' for function 'test9':
 133 ; SSE2: Found an estimated cost of 4 for instruction:   %shl
 134 ; SSE41: Found an estimated cost of 4 for instruction:   %shl
 135 ; AVX: Found an estimated cost of 8 for instruction:   %shl
 136 ; AVX2: Found an estimated cost of 2 for instruction:   %shl
 137
 138
 139 ; Same as 'test7', except that now the cost is double.
 140
 141 define <16 x i32> @test10(<16 x i32> %a) {   ; v16i32 shift left by a non-uniform constant vector
 142   %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>   ; the 4-lane amount pattern 1,1,2,3 repeated four times
 143   ret <16 x i32> %shl
 144 }
 145 ; CHECK: 'Cost Model Analysis' for function 'test10':
 146 ; SSE2: Found an estimated cost of 24 for instruction:   %shl
 147 ; SSE41: Found an estimated cost of 4 for instruction:   %shl
 148 ; AVX: Found an estimated cost of 8 for instruction:   %shl
 149 ; AVX2: Found an estimated cost of 2 for instruction:   %shl
 150
 151
 152 ; On AVX2 we are able to lower the following shift into a sequence of
 153 ; two vpsllvq instructions. Therefore, the expected cost is only 2.
 154 ; In all other cases, this shift is scalarized as we don't have vpsllv
 155 ; instructions.
 156
 157 define <8 x i64> @test11(<8 x i64> %a) {   ; v8i64 shift left by a non-uniform constant vector
 158   %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>   ; the 4-lane amount pattern 1,1,2,3 repeated twice
 159   ret <8 x i64> %shl
 160 }
 161 ; CHECK: 'Cost Model Analysis' for function 'test11':
 162 ; SSE2: Found an estimated cost of 16 for instruction:   %shl
 163 ; SSE41: Found an estimated cost of 16 for instruction:   %shl
 164 ; AVX: Found an estimated cost of 16 for instruction:   %shl
 165 ; AVX2: Found an estimated cost of 2 for instruction:   %shl
 166
 167