From 2537f3c6597bc1b8eb14c76c8f8e7046be41c9ba Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer
Date: Thu, 4 Apr 2013 23:26:24 +0000
Subject: [PATCH] X86 cost model: Differentiate cost for vector shifts of constants

SSE2 has efficient support for shifts by a scalar. My previous change,
which made vector shifts expensive, did not take this into account and
marked all shifts as expensive. This would prevent vectorization where
it is actually beneficial. With this change we differentiate between
shifts by a uniform constant and all other shifts.

radar://13576547

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178808 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetTransformInfo.cpp    |  29 ++
 test/Analysis/CostModel/X86/testshiftashr.ll | 288 +++++++++++++++++++
 test/Analysis/CostModel/X86/testshiftlshr.ll | 287 ++++++++++++++++++
 test/Analysis/CostModel/X86/testshiftshl.ll  | 288 +++++++++++++++++++
 4 files changed, 892 insertions(+)

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index e8940bfc230..a98c6991192 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -206,6 +206,35 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     return LT.first * AVX2CostTable[Idx].Cost;
   }
 
+  static const CostTblEntry SSE2UniformConstCostTable[] = {
+    // We don't correctly identify costs of casts because they are marked as
+    // custom.
+    // Constant splats are cheaper for the following instructions.
+    { ISD::SHL,  MVT::v16i8,  1 }, // psllw.
+    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
+    { ISD::SHL,  MVT::v4i32,  1 }, // pslld.
+    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.
+
+    { ISD::SRL,  MVT::v16i8,  1 }, // psrlw.
+    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
+    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
+    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.
+
+    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
+    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
+    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasSSE2()) {
+    int Idx = CostTableLookup(SSE2UniformConstCostTable,
+                              array_lengthof(SSE2UniformConstCostTable),
+                              ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * SSE2UniformConstCostTable[Idx].Cost;
+  }
+
+
   static const CostTblEntry SSE2CostTable[] = {
     // We don't correctly identify costs of casts because they are marked as
     // custom.
diff --git a/test/Analysis/CostModel/X86/testshiftashr.ll b/test/Analysis/CostModel/X86/testshiftashr.ll
index d932b2a4c42..f35eea87164 100644
--- a/test/Analysis/CostModel/X86/testshiftashr.ll
+++ b/test/Analysis/CostModel/X86/testshiftashr.ll
@@ -241,3 +241,291 @@ entry:
   ret %shifttype32i8 %0
 }
 
+; Test shift by a constant vector.
+
+%shifttypec = type <2 x i16>
+define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
+entry:
+  ; SSE2: shift2i16const
+  ; SSE2: cost of 20 {{.*}} ashr
+  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN: sarq $
+
+  %0 = ashr %shifttypec %a , <i16 3, i16 3>
+  ret %shifttypec %0
+}
+
+%shifttypec4i16 = type <4 x i16>
+define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
+entry:
+  ; SSE2: shift4i16const
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec4i16 %0
+}
+
+%shifttypec8i16 = type <8 x i16>
+define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
+entry:
+  ; SSE2: shift8i16const
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec8i16 %a ,
+            <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec8i16 %0
+}
+
+%shifttypec16i16 = type <16 x i16>
+define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
+                                         %shifttypec16i16 %b) {
+entry:
+  ; SSE2: shift16i16const
+  ; SSE2: cost of 2 {{.*}} ashr
+  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec16i16 %a ,
+            <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+             i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec16i16 %0
+}
+
+%shifttypec32i16 = type <32 x i16>
+define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
+                                         %shifttypec32i16 %b) {
+entry:
+  ; SSE2: shift32i16const
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec32i16 %a ,
+            <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+             i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+             i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+             i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec32i16 %0
+}
+
+%shifttypec2i32 = type <2 x i32>
+define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
+entry:
+  ; SSE2: shift2i32c
+  ; SSE2: cost of 20 {{.*}} ashr
+  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec2i32 %a , <i32 3, i32 3>
+  ret %shifttypec2i32 %0
+}
+
+%shifttypec4i32 = type <4 x i32>
+define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
+entry:
+  ; SSE2: shift4i32c
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec4i32 %0
+}
+
+%shifttypec8i32 = type <8 x i32>
+define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
+entry:
+  ; SSE2: shift8i32c
+  ; SSE2: cost of 2 {{.*}} ashr
+  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec8i32 %a ,
+            <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec8i32 %0
+}
+
+%shifttypec16i32 = type <16 x i32>
+define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
+entry:
+  ; SSE2: shift16i32c
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec16i32 %a ,
+            <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+             i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec16i32 %0
+}
+
+%shifttypec32i32 = type <32 x i32>
+define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
+entry:
+  ; SSE2: shift32i32c
+  ; getTypeConversion fails here and promotes this to an i64.
+  ; SSE2: cost of 256 {{.*}} ashr
+  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN: psrad $3
+  %0 = ashr %shifttypec32i32 %a ,
+            <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+             i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+             i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+             i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec32i32 %0
+}
+
+%shifttypec2i64 = type <2 x i64>
+define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
+entry:
+  ; SSE2: shift2i64c
+  ; SSE2: cost of 20 {{.*}} ashr
+  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec2i64 %a , <i64 3, i64 3>
+  ret %shifttypec2i64 %0
+}
+
+%shifttypec4i64 = type <4 x i64>
+define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
+entry:
+  ; SSE2: shift4i64c
+  ; SSE2: cost of 40 {{.*}} ashr
+  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec4i64 %0
+}
+
+%shifttypec8i64 = type <8 x i64>
+define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
+entry:
+  ; SSE2: shift8i64c
+  ; SSE2: cost of 80 {{.*}} ashr
+  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec8i64 %a ,
+            <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec8i64 %0
+}
+
+%shifttypec16i64 = type <16 x i64>
+define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
+entry:
+  ; SSE2: shift16i64c
+  ; SSE2: cost of 160 {{.*}} ashr
+  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec16i64 %a ,
+            <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+             i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec16i64 %0
+}
+
+%shifttypec32i64 = type <32 x i64>
+define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
+entry:
+  ; SSE2: shift32i64c
+  ; SSE2: cost of 256 {{.*}} ashr
+  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec32i64 %a ,
+            <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+             i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+             i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+             i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec32i64 %0
+}
+
+%shifttypec2i8 = type <2 x i8>
+define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
+entry:
+  ; SSE2: shift2i8c
+  ; SSE2: cost of 20 {{.*}} ashr
+  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec2i8 %a , <i8 3, i8 3>
+  ret %shifttypec2i8 %0
+}
+
+%shifttypec4i8 = type <4 x i8>
+define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
+entry:
+  ; SSE2: shift4i8c
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec4i8 %0
+}
+
+%shifttypec8i8 = type <8 x i8>
+define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
+entry:
+  ; SSE2: shift8i8c
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec8i8 %a ,
+            <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec8i8 %0
+}
+
+%shifttypec16i8 = type <16 x i8>
+define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
+entry:
+  ; SSE2: shift16i8c
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = ashr %shifttypec16i8 %a ,
+            <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+             i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec16i8 %0
+}
+
+%shifttypec32i8 = type <32 x i8>
+define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
+entry:
+  ; SSE2: shift32i8c
+  ; SSE2: cost of 8 {{.*}} ashr
+  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = ashr %shifttypec32i8 %a ,
+            <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+             i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+             i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+             i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec32i8 %0
+}
+
diff --git a/test/Analysis/CostModel/X86/testshiftlshr.ll b/test/Analysis/CostModel/X86/testshiftlshr.ll
index 7d665fc9c61..8d6ef387420 100644
--- a/test/Analysis/CostModel/X86/testshiftlshr.ll
+++ b/test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -241,3 +241,290 @@ entry:
   ret %shifttype32i8 %0
 }
 
+; Test shift by a constant vector.
+
+%shifttypec = type <2 x i16>
+define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
+entry:
+  ; SSE2: shift2i16const
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec %a , <i16 3, i16 3>
+  ret %shifttypec %0
+}
+
+%shifttypec4i16 = type <4 x i16>
+define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
+entry:
+  ; SSE2: shift4i16const
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec4i16 %0
+}
+
+%shifttypec8i16 = type <8 x i16>
+define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
+entry:
+  ; SSE2: shift8i16const
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec8i16 %a ,
+            <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec8i16 %0
+}
+
+%shifttypec16i16 = type <16 x i16>
+define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
+                                         %shifttypec16i16 %b) {
+entry:
+  ; SSE2: shift16i16const
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec16i16 %a ,
+            <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+             i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec16i16 %0
+}
+
+%shifttypec32i16 = type <32 x i16>
+define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
+                                         %shifttypec32i16 %b) {
+entry:
+  ; SSE2: shift32i16const
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec32i16 %a ,
+            <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+             i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+             i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+             i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec32i16 %0
+}
+
+%shifttypec2i32 = type <2 x i32>
+define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
+entry:
+  ; SSE2: shift2i32c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec2i32 %a , <i32 3, i32 3>
+  ret %shifttypec2i32 %0
+}
+
+%shifttypec4i32 = type <4 x i32>
+define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
+entry:
+  ; SSE2: shift4i32c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec4i32 %0
+}
+
+%shifttypec8i32 = type <8 x i32>
+define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
+entry:
+  ; SSE2: shift8i32c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec8i32 %a ,
+            <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec8i32 %0
+}
+
+%shifttypec16i32 = type <16 x i32>
+define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
+entry:
+  ; SSE2: shift16i32c
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec16i32 %a ,
+            <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+             i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec16i32 %0
+}
+
+%shifttypec32i32 = type <32 x i32>
+define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
+entry:
+  ; SSE2: shift32i32c
+  ; getTypeConversion fails here and promotes this to an i64.
+  ; SSE2: cost of 256 {{.*}} lshr
+  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN: psrld $3
+  %0 = lshr %shifttypec32i32 %a ,
+            <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+             i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+             i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+             i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec32i32 %0
+}
+
+%shifttypec2i64 = type <2 x i64>
+define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
+entry:
+  ; SSE2: shift2i64c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec2i64 %a , <i64 3, i64 3>
+  ret %shifttypec2i64 %0
+}
+
+%shifttypec4i64 = type <4 x i64>
+define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
+entry:
+  ; SSE2: shift4i64c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec4i64 %0
+}
+
+%shifttypec8i64 = type <8 x i64>
+define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
+entry:
+  ; SSE2: shift8i64c
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec8i64 %a ,
+            <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec8i64 %0
+}
+
+%shifttypec16i64 = type <16 x i64>
+define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
+entry:
+  ; SSE2: shift16i64c
+  ; SSE2: cost of 8 {{.*}} lshr
+  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec16i64 %a ,
+            <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+             i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec16i64 %0
+}
+
+%shifttypec32i64 = type <32 x i64>
+define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
+entry:
+  ; SSE2: shift32i64c
+  ; SSE2: cost of 256 {{.*}} lshr
+  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec32i64 %a ,
+            <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+             i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+             i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+             i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec32i64 %0
+}
+
+%shifttypec2i8 = type <2 x i8>
+define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
+entry:
+  ; SSE2: shift2i8c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec2i8 %a , <i8 3, i8 3>
+  ret %shifttypec2i8 %0
+}
+
+%shifttypec4i8 = type <4 x i8>
+define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
+entry:
+  ; SSE2: shift4i8c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec4i8 %0
+}
+
+%shifttypec8i8 = type <8 x i8>
+define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
+entry:
+  ; SSE2: shift8i8c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec8i8 %a ,
+            <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec8i8 %0
+}
+
+%shifttypec16i8 = type <16 x i8>
+define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
+entry:
+  ; SSE2: shift16i8c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec16i8 %a ,
+            <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+             i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec16i8 %0
+}
+
+%shifttypec32i8 = type <32 x i8>
+define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
+entry:
+  ; SSE2: shift32i8c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec32i8 %a ,
+            <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+             i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+             i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+             i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec32i8 %0
+}
diff --git a/test/Analysis/CostModel/X86/testshiftshl.ll b/test/Analysis/CostModel/X86/testshiftshl.ll
index 897d9832e50..f45a6987921 100644
--- a/test/Analysis/CostModel/X86/testshiftshl.ll
+++ b/test/Analysis/CostModel/X86/testshiftshl.ll
@@ -240,3 +240,291 @@ entry:
   %0 = shl %shifttype32i8 %a , %b
   ret %shifttype32i8 %0
 }
+
+; Test shift by a constant vector.
+
+%shifttypec = type <2 x i16>
+define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
+entry:
+  ; SSE2: shift2i16const
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec %a , <i16 3, i16 3>
+  ret %shifttypec %0
+}
+
+%shifttypec4i16 = type <4 x i16>
+define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
+entry:
+  ; SSE2: shift4i16const
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec4i16 %0
+}
+
+%shifttypec8i16 = type <8 x i16>
+define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
+entry:
+  ; SSE2: shift8i16const
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec8i16 %a ,
+           <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec8i16 %0
+}
+
+%shifttypec16i16 = type <16 x i16>
+define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
+                                         %shifttypec16i16 %b) {
+entry:
+  ; SSE2: shift16i16const
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec16i16 %a ,
+           <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+            i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec16i16 %0
+}
+
+%shifttypec32i16 = type <32 x i16>
+define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
+                                         %shifttypec32i16 %b) {
+entry:
+  ; SSE2: shift32i16const
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec32i16 %a ,
+           <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+            i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+            i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3,
+            i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec32i16 %0
+}
+
+%shifttypec2i32 = type <2 x i32>
+define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
+entry:
+  ; SSE2: shift2i32c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec2i32 %a , <i32 3, i32 3>
+  ret %shifttypec2i32 %0
+}
+
+%shifttypec4i32 = type <4 x i32>
+define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
+entry:
+  ; SSE2: shift4i32c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec4i32 %0
+}
+
+%shifttypec8i32 = type <8 x i32>
+define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
+entry:
+  ; SSE2: shift8i32c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec8i32 %a ,
+           <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec8i32 %0
+}
+
+%shifttypec16i32 = type <16 x i32>
+define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
+entry:
+  ; SSE2: shift16i32c
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec16i32 %a ,
+           <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+            i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec16i32 %0
+}
+
+%shifttypec32i32 = type <32 x i32>
+define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
+entry:
+  ; SSE2: shift32i32c
+  ; getTypeConversion fails here and promotes this to an i64.
+  ; SSE2: cost of 256 {{.*}} shl
+  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN: pslld $3
+  %0 = shl %shifttypec32i32 %a ,
+           <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+            i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+            i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+            i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec32i32 %0
+}
+
+%shifttypec2i64 = type <2 x i64>
+define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
+entry:
+  ; SSE2: shift2i64c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec2i64 %a , <i64 3, i64 3>
+  ret %shifttypec2i64 %0
+}
+
+%shifttypec4i64 = type <4 x i64>
+define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
+entry:
+  ; SSE2: shift4i64c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec4i64 %0
+}
+
+%shifttypec8i64 = type <8 x i64>
+define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
+entry:
+  ; SSE2: shift8i64c
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec8i64 %a ,
+           <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec8i64 %0
+}
+
+%shifttypec16i64 = type <16 x i64>
+define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
+entry:
+  ; SSE2: shift16i64c
+  ; SSE2: cost of 8 {{.*}} shl
+  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec16i64 %a ,
+           <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+            i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec16i64 %0
+}
+
+%shifttypec32i64 = type <32 x i64>
+define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
+entry:
+  ; SSE2: shift32i64c
+  ; SSE2: cost of 256 {{.*}} shl
+  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec32i64 %a ,
+           <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+            i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+            i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3,
+            i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec32i64 %0
+}
+
+%shifttypec2i8 = type <2 x i8>
+define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
+entry:
+  ; SSE2: shift2i8c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec2i8 %a , <i8 3, i8 3>
+  ret %shifttypec2i8 %0
+}
+
+%shifttypec4i8 = type <4 x i8>
+define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
+entry:
+  ; SSE2: shift4i8c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec4i8 %0
+}
+
+%shifttypec8i8 = type <8 x i8>
+define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
+entry:
+  ; SSE2: shift8i8c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec8i8 %a ,
+           <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec8i8 %0
+}
+
+%shifttypec16i8 = type <16 x i8>
+define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
+entry:
+  ; SSE2: shift16i8c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec16i8 %a ,
+           <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+            i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec16i8 %0
+}
+
+%shifttypec32i8 = type <32 x i8>
+define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
+entry:
+  ; SSE2: shift32i8c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec32i8 %a ,
+           <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+            i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+            i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+            i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec32i8 %0
+}
-- 
2.34.1
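Editor's addendum: the change above is, at its core, a table lookup keyed on
(ISD opcode, MVT) that is consulted only when the second operand is a uniform
constant, with a miss falling through to the generic (expensive) cost tables.
The following is a minimal, self-contained C++ sketch of that lookup pattern.
The enum values, the table name, and the fallback cost of 10 are simplified
stand-ins invented for illustration; they are not LLVM's definitions. The real
implementation is X86TTI::getArithmeticInstrCost in the diff above, which uses
LLVM's ISD opcodes, MVT types, and CostTableLookup.

// Standalone sketch of the cost-table lookup used by the patch. All names
// and values here are illustrative stand-ins, not LLVM's real definitions.
#include <cstdio>

enum ISDOpcode { SHL, SRL, SRA };
enum SimpleVT { v16i8, v8i16, v4i32, v2i64 };
enum OperandValueKind { OK_AnyValue, OK_UniformConstantValue };

struct CostTblEntry {
  ISDOpcode Opcode;
  SimpleVT Type;
  unsigned Cost;
};

// Mirrors SSE2UniformConstCostTable: shifts by a splat constant map to a
// single psll/psrl/psra, except SRA on v16i8, which needs a short sequence.
static const CostTblEntry UniformConstShiftCosts[] = {
  { SHL, v16i8, 1 }, { SHL, v8i16, 1 }, { SHL, v4i32, 1 }, { SHL, v2i64, 1 },
  { SRL, v16i8, 1 }, { SRL, v8i16, 1 }, { SRL, v4i32, 1 }, { SRL, v2i64, 1 },
  { SRA, v16i8, 4 }, { SRA, v8i16, 1 }, { SRA, v4i32, 1 },
};

// Linear scan in the spirit of LLVM's CostTableLookup; -1 means "no entry",
// which lets the caller fall through to the generic cost tables.
static int lookupCost(const CostTblEntry *Tbl, unsigned Len, ISDOpcode Op,
                      SimpleVT VT) {
  for (unsigned I = 0; I != Len; ++I)
    if (Tbl[I].Opcode == Op && Tbl[I].Type == VT)
      return static_cast<int>(I);
  return -1;
}

unsigned getShiftCost(ISDOpcode Op, SimpleVT VT, OperandValueKind Op2Info) {
  // Only shifts by a uniform constant get the cheap table; everything else
  // keeps the conservative cost that motivated the original change.
  if (Op2Info == OK_UniformConstantValue) {
    int Idx = lookupCost(UniformConstShiftCosts,
                         sizeof(UniformConstShiftCosts) /
                             sizeof(UniformConstShiftCosts[0]),
                         Op, VT);
    if (Idx != -1)
      return UniformConstShiftCosts[Idx].Cost;
  }
  return 10; // Invented placeholder for the generic, expensive fallback.
}

int main() {
  // A v4i32 shl by a splat constant is a single pslld: cost 1.
  std::printf("splat shl v4i32:    %u\n",
              getShiftCost(SHL, v4i32, OK_UniformConstantValue));
  // A v4i32 shl by a variable amount falls through to the expensive path,
  // matching the asymmetry the cost-model tests above check for.
  std::printf("variable shl v4i32: %u\n",
              getShiftCost(SHL, v4i32, OK_AnyValue));
  return 0;
}

The important property, visible in both the sketch and the patch, is that a
failed lookup changes nothing: only the (opcode, type) pairs known to lower to
cheap SSE2 immediate-shift instructions get the low cost, so the vectorizer is
encouraged exactly where the hardware is actually efficient.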