From: Simon Pilgrim
Date: Fri, 21 Aug 2015 21:09:51 +0000 (+0000)
Subject: Line endings fix.
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;ds=sidebyside;h=6f86faedf0b38e6ce3b691c247acdbd6334ee161;p=oota-llvm.git

Line endings fix.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245736 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/test/Transforms/InstCombine/x86-vector-shifts.ll b/test/Transforms/InstCombine/x86-vector-shifts.ll
index 26581e0560b..17fabc4cdc1 100644
--- a/test/Transforms/InstCombine/x86-vector-shifts.ll
+++ b/test/Transforms/InstCombine/x86-vector-shifts.ll
@@ -1,1274 +1,1274 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-;
-; ASHR - Immediate
-;
-
-define <8 x i16> @sse2_psrai_w_0(<8 x i16> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @sse2_psrai_w_0
-; CHECK-NEXT: ret <8 x i16> %v
- %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 0)
- ret <8 x i16> %1
-}
-
-define <8 x i16> @sse2_psrai_w_15(<8 x i16> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @sse2_psrai_w_15
-; CHECK-NEXT: %1 = ashr <8 x i16> %v,
-; CHECK-NEXT: ret <8 x i16> %1
- %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 15)
- ret <8 x i16> %1
-}
-
-define <8 x i16> @sse2_psrai_w_64(<8 x i16> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @sse2_psrai_w_64
-; CHECK-NEXT: %1 = ashr <8 x i16> %v,
-; CHECK-NEXT: ret <8 x i16> %1
- %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 64)
- ret <8 x i16> %1
-}
-
-define <4 x i32> @sse2_psrai_d_0(<4 x i32> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @sse2_psrai_d_0
-; CHECK-NEXT: ret <4 x i32> %v
- %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 0)
- ret <4 x i32> %1
-}
-
-define <4 x i32> @sse2_psrai_d_15(<4 x i32> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @sse2_psrai_d_15
-; CHECK-NEXT: %1 = ashr <4 x i32> %v,
-; CHECK-NEXT: ret <4 x i32> %1
- %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 15)
- ret <4 x i32> %1
-}
-
-define <4 x i32> @sse2_psrai_d_64(<4 x i32> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @sse2_psrai_d_64
-; CHECK-NEXT: %1 = ashr <4 x i32> %v,
-; CHECK-NEXT: ret <4 x i32> %1
- %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 64)
- ret <4 x i32> %1
-}
-
-define <16 x i16> @avx2_psrai_w_0(<16 x i16> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @avx2_psrai_w_0
-; CHECK-NEXT: ret <16 x i16> %v
- %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 0)
- ret <16 x i16> %1
-}
-
-define <16 x i16> @avx2_psrai_w_15(<16 x i16> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @avx2_psrai_w_15
-; CHECK-NEXT: %1 = ashr <16 x i16> %v,
-; CHECK-NEXT: ret <16 x i16> %1
- %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 15)
- ret <16 x i16> %1
-}
-
-define <16 x i16> @avx2_psrai_w_64(<16 x i16> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @avx2_psrai_w_64
-; CHECK-NEXT: %1 = ashr <16 x i16> %v,
-; CHECK-NEXT: ret <16 x i16> %1
- %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 64)
- ret <16 x i16> %1
-}
-
-define <8 x i32> @avx2_psrai_d_0(<8 x i32> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @avx2_psrai_d_0
-; CHECK-NEXT: ret <8 x i32> %v
- %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 0)
- ret <8 x i32> %1
-}
-
-define <8 x i32> @avx2_psrai_d_15(<8 x i32> %v) nounwind readnone uwtable {
-; CHECK-LABEL: @avx2_psrai_d_15
-; CHECK-NEXT: %1 = ashr <8 x i32> %v,
-;
CHECK-NEXT: ret <8 x i32> %1 - %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 15) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psrai_d_64(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrai_d_64 -; CHECK-NEXT: %1 = ashr <8 x i32> %v, -; CHECK-NEXT: ret <8 x i32> %1 - %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 64) - ret <8 x i32> %1 -} - -; -; LSHR - Immediate -; - -define <8 x i16> @sse2_psrli_w_0(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrli_w_0 -; CHECK-NEXT: ret <8 x i16> %v - %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 0) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psrli_w_15(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrli_w_15 -; CHECK-NEXT: %1 = lshr <8 x i16> %v, -; CHECK-NEXT: ret <8 x i16> %1 - %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 15) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psrli_w_64(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrli_w_64 -; CHECK-NEXT: ret <8 x i16> zeroinitializer - %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 64) - ret <8 x i16> %1 -} - -define <4 x i32> @sse2_psrli_d_0(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrli_d_0 -; CHECK-NEXT: ret <4 x i32> %v - %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 0) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psrli_d_15(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrli_d_15 -; CHECK-NEXT: %1 = lshr <4 x i32> %v, -; CHECK-NEXT: ret <4 x i32> %1 - %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 15) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psrli_d_64(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrli_d_64 -; CHECK-NEXT: ret <4 x i32> zeroinitializer - %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 64) - ret <4 x i32> %1 -} - -define <2 x i64> @sse2_psrli_q_0(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrli_q_0 -; CHECK-NEXT: ret <2 x i64> %v - %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %v, i32 0) - ret <2 x i64> %1 -} - -define <2 x i64> @sse2_psrli_q_15(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrli_q_15 -; CHECK-NEXT: %1 = lshr <2 x i64> %v, -; CHECK-NEXT: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %v, i32 15) - ret <2 x i64> %1 -} - -define <2 x i64> @sse2_psrli_q_64(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrli_q_64 -; CHECK-NEXT: ret <2 x i64> zeroinitializer - %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %v, i32 64) - ret <2 x i64> %1 -} - -define <16 x i16> @avx2_psrli_w_0(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrli_w_0 -; CHECK-NEXT: ret <16 x i16> %v - %1 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %v, i32 0) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psrli_w_15(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrli_w_15 -; CHECK-NEXT: %1 = lshr <16 x i16> %v, -; CHECK-NEXT: ret <16 x i16> %1 - %1 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %v, i32 15) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psrli_w_64(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrli_w_64 -; CHECK-NEXT: ret <16 x i16> zeroinitializer - %1 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %v, i32 64) - ret <16 x i16> %1 -} - -define <8 x i32> @avx2_psrli_d_0(<8 x i32> %v) 
nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrli_d_0 -; CHECK-NEXT: ret <8 x i32> %v - %1 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %v, i32 0) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psrli_d_15(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrli_d_15 -; CHECK-NEXT: %1 = lshr <8 x i32> %v, -; CHECK-NEXT: ret <8 x i32> %1 - %1 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %v, i32 15) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psrli_d_64(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrli_d_64 -; CHECK-NEXT: ret <8 x i32> zeroinitializer - %1 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %v, i32 64) - ret <8 x i32> %1 -} - -define <4 x i64> @avx2_psrli_q_0(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrli_q_0 -; CHECK-NEXT: ret <4 x i64> %v - %1 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 0) - ret <4 x i64> %1 -} - -define <4 x i64> @avx2_psrli_q_15(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrli_q_15 -; CHECK-NEXT: %1 = lshr <4 x i64> %v, -; CHECK-NEXT: ret <4 x i64> %1 - %1 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 15) - ret <4 x i64> %1 -} - -define <4 x i64> @avx2_psrli_q_64(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrli_q_64 -; CHECK-NEXT: ret <4 x i64> zeroinitializer - %1 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 64) - ret <4 x i64> %1 -} - -; -; SHL - Immediate -; - -define <8 x i16> @sse2_pslli_w_0(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_pslli_w_0 -; CHECK-NEXT: ret <8 x i16> %v - %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %v, i32 0) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_pslli_w_15(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_pslli_w_15 -; CHECK-NEXT: %1 = shl <8 x i16> %v, -; CHECK-NEXT: ret <8 x i16> %1 - %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %v, i32 15) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_pslli_w_64(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_pslli_w_64 -; CHECK-NEXT: ret <8 x i16> zeroinitializer - %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %v, i32 64) - ret <8 x i16> %1 -} - -define <4 x i32> @sse2_pslli_d_0(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_pslli_d_0 -; CHECK-NEXT: ret <4 x i32> %v - %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 0) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_pslli_d_15(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_pslli_d_15 -; CHECK-NEXT: %1 = shl <4 x i32> %v, -; CHECK-NEXT: ret <4 x i32> %1 - %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 15) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_pslli_d_64(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_pslli_d_64 -; CHECK-NEXT: ret <4 x i32> zeroinitializer - %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 64) - ret <4 x i32> %1 -} - -define <2 x i64> @sse2_pslli_q_0(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_pslli_q_0 -; CHECK-NEXT: ret <2 x i64> %v - %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 0) - ret <2 x i64> %1 -} - -define <2 x i64> @sse2_pslli_q_15(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_pslli_q_15 -; CHECK-NEXT: %1 = shl <2 x i64> %v, -; CHECK-NEXT: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 15) - ret <2 x i64> %1 -} - -define <2 x 
i64> @sse2_pslli_q_64(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_pslli_q_64 -; CHECK-NEXT: ret <2 x i64> zeroinitializer - %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 64) - ret <2 x i64> %1 -} - -define <16 x i16> @avx2_pslli_w_0(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_pslli_w_0 -; CHECK-NEXT: ret <16 x i16> %v - %1 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 0) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_pslli_w_15(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_pslli_w_15 -; CHECK-NEXT: %1 = shl <16 x i16> %v, -; CHECK-NEXT: ret <16 x i16> %1 - %1 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 15) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_pslli_w_64(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_pslli_w_64 -; CHECK-NEXT: ret <16 x i16> zeroinitializer - %1 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 64) - ret <16 x i16> %1 -} - -define <8 x i32> @avx2_pslli_d_0(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_pslli_d_0 -; CHECK-NEXT: ret <8 x i32> %v - %1 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %v, i32 0) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_pslli_d_15(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_pslli_d_15 -; CHECK-NEXT: %1 = shl <8 x i32> %v, -; CHECK-NEXT: ret <8 x i32> %1 - %1 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %v, i32 15) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_pslli_d_64(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_pslli_d_64 -; CHECK-NEXT: ret <8 x i32> zeroinitializer - %1 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %v, i32 64) - ret <8 x i32> %1 -} - -define <4 x i64> @avx2_pslli_q_0(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_pslli_q_0 -; CHECK-NEXT: ret <4 x i64> %v - %1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 0) - ret <4 x i64> %1 -} - -define <4 x i64> @avx2_pslli_q_15(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_pslli_q_15 -; CHECK-NEXT: %1 = shl <4 x i64> %v, -; CHECK-NEXT: ret <4 x i64> %1 - %1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 15) - ret <4 x i64> %1 -} - -define <4 x i64> @avx2_pslli_q_64(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_pslli_q_64 -; CHECK-NEXT: ret <4 x i64> zeroinitializer - %1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 64) - ret <4 x i64> %1 -} - -; -; ASHR - Constant Vector -; - -define <8 x i16> @sse2_psra_w_0(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psra_w_0 -; CHECK-NEXT: ret <8 x i16> %v - %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> zeroinitializer) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psra_w_15(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psra_w_15 -; CHECK-NEXT: %1 = ashr <8 x i16> %v, -; CHECK-NEXT: ret <8 x i16> %1 - %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psra_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psra_w_15_splat -; CHECK-NEXT: %1 = ashr <8 x i16> %v, -; CHECK-NEXT: ret <8 x i16> %1 - %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psra_w_64(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psra_w_64 -; CHECK-NEXT: %1 = ashr <8 x i16> %v, -; 
CHECK-NEXT: ret <8 x i16> %1 - %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) - ret <8 x i16> %1 -} - -define <4 x i32> @sse2_psra_d_0(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psra_d_0 -; CHECK-NEXT: ret <4 x i32> %v - %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> zeroinitializer) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psra_d_15(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psra_d_15 -; CHECK-NEXT: %1 = ashr <4 x i32> %v, -; CHECK-NEXT: ret <4 x i32> %1 - %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psra_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psra_d_15_splat -; CHECK-NEXT: %1 = ashr <4 x i32> %v, -; CHECK-NEXT: ret <4 x i32> %1 - %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psra_d_64(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psra_d_64 -; CHECK-NEXT: %1 = ashr <4 x i32> %v, -; CHECK-NEXT: ret <4 x i32> %1 - %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) - ret <4 x i32> %1 -} - -define <16 x i16> @avx2_psra_w_0(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psra_w_0 -; CHECK-NEXT: ret <16 x i16> %v - %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> zeroinitializer) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psra_w_15(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psra_w_15 -; CHECK-NEXT: %1 = ashr <16 x i16> %v, -; CHECK-NEXT: ret <16 x i16> %1 - %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psra_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psra_w_15_splat -; CHECK-NEXT: %1 = ashr <16 x i16> %v, -; CHECK-NEXT: ret <16 x i16> %1 - %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psra_w_64(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psra_w_64 -; CHECK-NEXT: %1 = ashr <16 x i16> %v, -; CHECK-NEXT: ret <16 x i16> %1 - %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) - ret <16 x i16> %1 -} - -define <8 x i32> @avx2_psra_d_0(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psra_d_0 -; CHECK-NEXT: ret <8 x i32> %v - %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> zeroinitializer) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psra_d_15(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psra_d_15 -; CHECK-NEXT: %1 = ashr <8 x i32> %v, -; CHECK-NEXT: ret <8 x i32> %1 - %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psra_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psra_d_15_splat -; CHECK-NEXT: %1 = ashr <8 x i32> %v, -; CHECK-NEXT: ret <8 x i32> %1 - %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psra_d_64(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psra_d_64 -; CHECK-NEXT: %1 = ashr <8 x i32> %v, -; CHECK-NEXT: ret <8 x i32> %1 - %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) - ret <8 x i32> %1 -} - -; -; LSHR - Constant Vector -; - -define <8 x i16> @sse2_psrl_w_0(<8 x i16> %v) nounwind readnone 
uwtable { -; CHECK-LABEL: @sse2_psrl_w_0 -; CHECK-NEXT: ret <8 x i16> %v - %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> zeroinitializer) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psrl_w_15(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_w_15 -; CHECK-NEXT: %1 = lshr <8 x i16> %v, -; CHECK-NEXT: ret <8 x i16> %1 - %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psrl_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_w_15_splat -; CHECK-NEXT: ret <8 x i16> zeroinitializer - %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psrl_w_64(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_w_64 -; CHECK-NEXT: ret <8 x i16> zeroinitializer - %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> ) - ret <8 x i16> %1 -} - -define <4 x i32> @sse2_psrl_d_0(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_d_0 -; CHECK-NEXT: ret <4 x i32> %v - %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> zeroinitializer) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psrl_d_15(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_d_15 -; CHECK-NEXT: %1 = lshr <4 x i32> %v, -; CHECK-NEXT: ret <4 x i32> %1 - %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> ) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psrl_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_d_15_splat -; CHECK-NEXT: ret <4 x i32> zeroinitializer - %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> ) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psrl_d_64(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_d_64 -; CHECK-NEXT: ret <4 x i32> zeroinitializer - %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> ) - ret <4 x i32> %1 -} - -define <2 x i64> @sse2_psrl_q_0(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_q_0 -; CHECK-NEXT: ret <2 x i64> %v - %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> zeroinitializer) - ret <2 x i64> %1 -} - -define <2 x i64> @sse2_psrl_q_15(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_q_15 -; CHECK-NEXT: %1 = lshr <2 x i64> %v, -; CHECK-NEXT: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> ) - ret <2 x i64> %1 -} - -define <2 x i64> @sse2_psrl_q_64(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_q_64 -; CHECK-NEXT: ret <2 x i64> zeroinitializer - %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> ) - ret <2 x i64> %1 -} - -define <16 x i16> @avx2_psrl_w_0(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_w_0 -; CHECK-NEXT: ret <16 x i16> %v - %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> zeroinitializer) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psrl_w_15(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_w_15 -; CHECK-NEXT: %1 = lshr <16 x i16> %v, -; CHECK-NEXT: ret <16 x i16> %1 - %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psrl_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_w_15_splat -; CHECK-NEXT: ret <16 x i16> zeroinitializer - %1 = tail call <16 x i16> 
@llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psrl_w_64(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_w_64 -; CHECK-NEXT: ret <16 x i16> zeroinitializer - %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> ) - ret <16 x i16> %1 -} - -define <8 x i32> @avx2_psrl_d_0(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_d_0 -; CHECK-NEXT: ret <8 x i32> %v - %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> zeroinitializer) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psrl_d_15(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_d_15 -; CHECK-NEXT: %1 = lshr <8 x i32> %v, -; CHECK-NEXT: ret <8 x i32> %1 - %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psrl_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_d_15_splat -; CHECK-NEXT: ret <8 x i32> zeroinitializer - %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psrl_d_64(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_d_64 -; CHECK-NEXT: ret <8 x i32> zeroinitializer - %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> ) - ret <8 x i32> %1 -} - -define <4 x i64> @avx2_psrl_q_0(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_q_0 -; CHECK-NEXT: ret <4 x i64> %v - %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> zeroinitializer) - ret <4 x i64> %1 -} - -define <4 x i64> @avx2_psrl_q_15(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_q_15 -; CHECK-NEXT: %1 = lshr <4 x i64> %v, -; CHECK-NEXT: ret <4 x i64> %1 - %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @avx2_psrl_q_64(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_q_64 -; CHECK-NEXT: ret <4 x i64> zeroinitializer - %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> ) - ret <4 x i64> %1 -} - -; -; SHL - Constant Vector -; - -define <8 x i16> @sse2_psll_w_0(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_w_0 -; CHECK-NEXT: ret <8 x i16> %v - %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> zeroinitializer) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psll_w_15(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_w_15 -; CHECK-NEXT: %1 = shl <8 x i16> %v, -; CHECK-NEXT: ret <8 x i16> %1 - %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psll_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_w_15_splat -; CHECK-NEXT: ret <8 x i16> zeroinitializer - %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @sse2_psll_w_64(<8 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_w_64 -; CHECK-NEXT: ret <8 x i16> zeroinitializer - %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> ) - ret <8 x i16> %1 -} - -define <4 x i32> @sse2_psll_d_0(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_d_0 -; CHECK-NEXT: ret <4 x i32> %v - %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> zeroinitializer) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psll_d_15(<4 x i32> %v) nounwind 
readnone uwtable { -; CHECK-LABEL: @sse2_psll_d_15 -; CHECK-NEXT: %1 = shl <4 x i32> %v, -; CHECK-NEXT: ret <4 x i32> %1 - %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> ) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psll_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_d_15_splat -; CHECK-NEXT: ret <4 x i32> zeroinitializer - %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> ) - ret <4 x i32> %1 -} - -define <4 x i32> @sse2_psll_d_64(<4 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_d_64 -; CHECK-NEXT: ret <4 x i32> zeroinitializer - %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> ) - ret <4 x i32> %1 -} - -define <2 x i64> @sse2_psll_q_0(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_q_0 -; CHECK-NEXT: ret <2 x i64> %v - %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> zeroinitializer) - ret <2 x i64> %1 -} - -define <2 x i64> @sse2_psll_q_15(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_q_15 -; CHECK-NEXT: %1 = shl <2 x i64> %v, -; CHECK-NEXT: ret <2 x i64> %1 - %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> ) - ret <2 x i64> %1 -} - -define <2 x i64> @sse2_psll_q_64(<2 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_q_64 -; CHECK-NEXT: ret <2 x i64> zeroinitializer - %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> ) - ret <2 x i64> %1 -} - -define <16 x i16> @avx2_psll_w_0(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_w_0 -; CHECK-NEXT: ret <16 x i16> %v - %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> zeroinitializer) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psll_w_15(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_w_15 -; CHECK-NEXT: %1 = shl <16 x i16> %v, -; CHECK-NEXT: ret <16 x i16> %1 - %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psll_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_w_15_splat -; CHECK-NEXT: ret <16 x i16> zeroinitializer - %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @avx2_psll_w_64(<16 x i16> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_w_64 -; CHECK-NEXT: ret <16 x i16> zeroinitializer - %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> ) - ret <16 x i16> %1 -} - -define <8 x i32> @avx2_psll_d_0(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_d_0 -; CHECK-NEXT: ret <8 x i32> %v - %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> zeroinitializer) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psll_d_15(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_d_15 -; CHECK-NEXT: %1 = shl <8 x i32> %v, -; CHECK-NEXT: ret <8 x i32> %1 - %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psll_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_d_15_splat -; CHECK-NEXT: ret <8 x i32> zeroinitializer - %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @avx2_psll_d_64(<8 x i32> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_d_64 -; CHECK-NEXT: ret <8 x i32> zeroinitializer - %1 = tail call <8 x 
i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> ) - ret <8 x i32> %1 -} - -define <4 x i64> @avx2_psll_q_0(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_q_0 -; CHECK-NEXT: ret <4 x i64> %v - %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> zeroinitializer) - ret <4 x i64> %1 -} - -define <4 x i64> @avx2_psll_q_15(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_q_15 -; CHECK-NEXT: %1 = shl <4 x i64> %v, -; CHECK-NEXT: ret <4 x i64> %1 - %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @avx2_psll_q_64(<4 x i64> %v) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_q_64 -; CHECK-NEXT: ret <4 x i64> zeroinitializer - %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> ) - ret <4 x i64> %1 -} - -; -; Vector Demanded Bits -; - -define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psra_w_var -; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <8 x i16> %1 - %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> - %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1) - ret <8 x i16> %2 -} - -define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psra_d_var -; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <4 x i32> %1 - %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> - %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1) - ret <4 x i32> %2 -} - -define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psra_w_var -; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <16 x i16> %1 - %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> - %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1) - ret <16 x i16> %2 -} - -define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psra_d_var -; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <8 x i32> %1 - %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> - %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1) - ret <8 x i32> %2 -} - -define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_w_var -; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <8 x i16> %1 - %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> - %2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1) - ret <8 x i16> %2 -} - -define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_d_var -; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <4 x i32> %1 - %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> - %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1) - ret <4 x i32> %2 -} - -define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psrl_q_var -; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 
x i64> %a) -; CHECK-NEXT: ret <2 x i64> %1 - %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> - %2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1) - ret <2 x i64> %2 -} - -define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_w_var -; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <16 x i16> %1 - %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> - %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1) - ret <16 x i16> %2 -} - -define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_d_var -; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <8 x i32> %1 - %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> - %2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1) - ret <8 x i32> %2 -} - -define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psrl_q_var -; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <4 x i64> %1 - %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> - %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1) - ret <4 x i64> %2 -} - -define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_w_var -; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <8 x i16> %1 - %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> - %2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1) - ret <8 x i16> %2 -} - -define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_d_var -; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <4 x i32> %1 - %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> - %2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1) - ret <4 x i32> %2 -} - -define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable { -; CHECK-LABEL: @sse2_psll_q_var -; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <2 x i64> %1 - %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> - %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1) - ret <2 x i64> %2 -} - -define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_w_var -; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %a) -; CHECK-NEXT: ret <16 x i16> %1 - %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> - %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1) - ret <16 x i16> %2 -} - -define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_d_var -; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %a) -; CHECK-NEXT: ret <8 x i32> %1 - %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> - %2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1) - ret <8 x i32> %2 -} - -define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> 
%a) nounwind readnone uwtable { -; CHECK-LABEL: @avx2_psll_q_var -; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %a) -; CHECK-NEXT: ret <4 x i64> %1 - %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> - %2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1) - ret <4 x i64> %2 -} - -; -; Constant Folding -; - -define <8 x i16> @test_sse2_psra_w_0(<8 x i16> %A) { -; CHECK-LABEL: @test_sse2_psra_w_0 -; CHECK-NEXT: ret <8 x i16> %A - %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0) - %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> ) - %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0) - ret <8 x i16> %3 -} - -define <8 x i16> @test_sse2_psra_w_8() { -; CHECK-LABEL: @test_sse2_psra_w_8 -; CHECK-NEXT: ret <8 x i16> - %1 = bitcast <2 x i64> to <8 x i16> - %2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %1, i32 3) - %3 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %2, <8 x i16> ) - %4 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %3, i32 2) - ret <8 x i16> %4 -} - -define <4 x i32> @test_sse2_psra_d_0(<4 x i32> %A) { -; CHECK-LABEL: @test_sse2_psra_d_0 -; CHECK-NEXT: ret <4 x i32> %A - %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0) - %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> ) - %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 0) - ret <4 x i32> %3 -} - -define <4 x i32> @sse2_psra_d_8() { -; CHECK-LABEL: @sse2_psra_d_8 -; CHECK-NEXT: ret <4 x i32> - %1 = bitcast <2 x i64> to <4 x i32> - %2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 3) - %3 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %2, <4 x i32> ) - %4 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %3, i32 2) - ret <4 x i32> %4 -} - -define <16 x i16> @test_avx2_psra_w_0(<16 x i16> %A) { -; CHECK-LABEL: @test_avx2_psra_w_0 -; CHECK-NEXT: ret <16 x i16> %A - %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0) - %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> ) - %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0) - ret <16 x i16> %3 -} - -define <16 x i16> @test_avx2_psra_w_8(<16 x i16> %A) { -; CHECK-LABEL: @test_avx2_psra_w_8 -; CHECK-NEXT: ret <16 x i16> - %1 = bitcast <4 x i64> to <16 x i16> - %2 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %1, i32 3) - %3 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %2, <8 x i16> ) - %4 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %3, i32 2) - ret <16 x i16> %4 -} - -define <8 x i32> @test_avx2_psra_d_0(<8 x i32> %A) { -; CHECK-LABEL: @test_avx2_psra_d_0 -; CHECK-NEXT: ret <8 x i32> %A - %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0) - %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> ) - %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0) - ret <8 x i32> %3 -} - -define <8 x i32> @test_avx2_psra_d_8() { -; CHECK-LABEL: @test_avx2_psra_d_8 -; CHECK-NEXT: ret <8 x i32> - %1 = bitcast <4 x i64> to <8 x i32> - %2 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %1, i32 3) - %3 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %2, <4 x i32> ) - %4 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %3, i32 2) - ret <8 x i32> %4 -} - -define <2 x i64> @test_sse2_1() nounwind readnone uwtable { - %S = bitcast i32 1 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 
%1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> , <8 x i16> %4) - %6 = bitcast <8 x i16> %5 to <4 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7) - %9 = bitcast <4 x i32> %8 to <2 x i64> - %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3) - %11 = bitcast <2 x i64> %10 to <8 x i16> - %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S) - %13 = bitcast <8 x i16> %12 to <4 x i32> - %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S) - %15 = bitcast <4 x i32> %14 to <2 x i64> - %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S) - ret <2 x i64> %16 -; CHECK: test_sse2_1 -; CHECK: ret <2 x i64> -} - -define <4 x i64> @test_avx2_1() nounwind readnone uwtable { - %S = bitcast i32 1 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> , <8 x i16> %4) - %6 = bitcast <16 x i16> %5 to <8 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7) - %9 = bitcast <8 x i32> %8 to <4 x i64> - %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3) - %11 = bitcast <4 x i64> %10 to <16 x i16> - %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S) - %13 = bitcast <16 x i16> %12 to <8 x i32> - %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S) - %15 = bitcast <8 x i32> %14 to <4 x i64> - %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S) - ret <4 x i64> %16 -; CHECK: test_avx2_1 -; CHECK: ret <4 x i64> -} - -define <2 x i64> @test_sse2_0() nounwind readnone uwtable { - %S = bitcast i32 128 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> , <8 x i16> %4) - %6 = bitcast <8 x i16> %5 to <4 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7) - %9 = bitcast <4 x i32> %8 to <2 x i64> - %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3) - %11 = bitcast <2 x i64> %10 to <8 x i16> - %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S) - %13 = bitcast <8 x i16> %12 to <4 x i32> - %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S) - %15 = bitcast <4 x i32> %14 to <2 x i64> - %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S) - ret <2 x i64> %16 -; CHECK: test_sse2_0 -; CHECK: ret <2 x i64> zeroinitializer -} - -define <4 x i64> @test_avx2_0() nounwind readnone uwtable { - %S = bitcast i32 128 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> , <8 x i16> %4) - %6 = bitcast <16 x i16> %5 to <8 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7) - %9 = bitcast <8 x i32> %8 to <4 x i64> - %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x 
i64> %9, <2 x i64> %3) - %11 = bitcast <4 x i64> %10 to <16 x i16> - %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S) - %13 = bitcast <16 x i16> %12 to <8 x i32> - %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S) - %15 = bitcast <8 x i32> %14 to <4 x i64> - %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S) - ret <4 x i64> %16 -; CHECK: test_avx2_0 -; CHECK: ret <4 x i64> zeroinitializer -} -define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable { - %S = bitcast i32 1 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> , <8 x i16> %4) - %6 = bitcast <8 x i16> %5 to <4 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7) - %9 = bitcast <4 x i32> %8 to <2 x i64> - %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3) - %11 = bitcast <2 x i64> %10 to <8 x i16> - %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S) - %13 = bitcast <8 x i16> %12 to <4 x i32> - %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S) - %15 = bitcast <4 x i32> %14 to <2 x i64> - %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S) - ret <2 x i64> %16 -; CHECK: test_sse2_psrl_1 -; CHECK: ret <2 x i64> -} - -define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable { - %S = bitcast i32 1 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> , <8 x i16> %4) - %6 = bitcast <16 x i16> %5 to <8 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7) - %9 = bitcast <8 x i32> %8 to <4 x i64> - %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3) - %11 = bitcast <4 x i64> %10 to <16 x i16> - %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S) - %13 = bitcast <16 x i16> %12 to <8 x i32> - %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S) - %15 = bitcast <8 x i32> %14 to <4 x i64> - %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S) - ret <4 x i64> %16 -; CHECK: test_avx2_psrl_1 -; CHECK: ret <4 x i64> -} - -define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable { - %S = bitcast i32 128 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> , <8 x i16> %4) - %6 = bitcast <8 x i16> %5 to <4 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7) - %9 = bitcast <4 x i32> %8 to <2 x i64> - %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3) - %11 = bitcast <2 x i64> %10 to <8 x i16> - %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S) - %13 = bitcast <8 x i16> %12 to <4 x i32> - %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S) - %15 = bitcast <4 x i32> %14 to <2 x i64> - %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S) - ret <2 x i64> %16 -; CHECK: 
test_sse2_psrl_0 -; CHECK: ret <2 x i64> zeroinitializer -} - -define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable { - %S = bitcast i32 128 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> , <8 x i16> %4) - %6 = bitcast <16 x i16> %5 to <8 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7) - %9 = bitcast <8 x i32> %8 to <4 x i64> - %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3) - %11 = bitcast <4 x i64> %10 to <16 x i16> - %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S) - %13 = bitcast <16 x i16> %12 to <8 x i32> - %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S) - %15 = bitcast <8 x i32> %14 to <4 x i64> - %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S) - ret <4 x i64> %16 -; CHECK: test_avx2_psrl_0 -; CHECK: ret <4 x i64> zeroinitializer -} - -declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1 -declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1 -declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1 -declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1 -declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1 -declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1 -declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1 -declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1 -declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1 -declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1 -declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1 -declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1 - -declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1 -declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1 -declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1 -declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1 -declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1 -declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1 -declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1 -declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1 -declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1 -declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1 -declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1 -declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1 - -declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) #1 -declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) #1 -declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) #1 -declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) #1 -declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) #1 -declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) #1 -declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) #1 -declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) #1 - -attributes #1 = { nounwind readnone } +; RUN: opt < %s -instcombine -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; +; ASHR - Immediate +; + +define <8 x i16> @sse2_psrai_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_w_0 +; CHECK-NEXT: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x 
i16> %v, i32 0) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psrai_w_15(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_w_15 +; CHECK-NEXT: %1 = ashr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 15) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psrai_w_64(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_w_64 +; CHECK-NEXT: %1 = ashr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 64) + ret <8 x i16> %1 +} + +define <4 x i32> @sse2_psrai_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_d_0 +; CHECK-NEXT: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 0) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrai_d_15(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_d_15 +; CHECK-NEXT: %1 = ashr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 15) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrai_d_64(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_d_64 +; CHECK-NEXT: %1 = ashr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 64) + ret <4 x i32> %1 +} + +define <16 x i16> @avx2_psrai_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_w_0 +; CHECK-NEXT: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 0) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrai_w_15(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_w_15 +; CHECK-NEXT: %1 = ashr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 15) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrai_w_64(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_w_64 +; CHECK-NEXT: %1 = ashr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 64) + ret <16 x i16> %1 +} + +define <8 x i32> @avx2_psrai_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_d_0 +; CHECK-NEXT: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 0) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrai_d_15(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_d_15 +; CHECK-NEXT: %1 = ashr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 15) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrai_d_64(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_d_64 +; CHECK-NEXT: %1 = ashr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 64) + ret <8 x i32> %1 +} + +; +; LSHR - Immediate +; + +define <8 x i16> @sse2_psrli_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrli_w_0 +; CHECK-NEXT: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 0) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psrli_w_15(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrli_w_15 +; CHECK-NEXT: %1 = lshr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 15) + ret <8 x i16> %1 +} + +define <8 x i16> 
@sse2_psrli_w_64(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrli_w_64 +; CHECK-NEXT: ret <8 x i16> zeroinitializer + %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 64) + ret <8 x i16> %1 +} + +define <4 x i32> @sse2_psrli_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrli_d_0 +; CHECK-NEXT: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 0) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrli_d_15(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrli_d_15 +; CHECK-NEXT: %1 = lshr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 15) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrli_d_64(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrli_d_64 +; CHECK-NEXT: ret <4 x i32> zeroinitializer + %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 64) + ret <4 x i32> %1 +} + +define <2 x i64> @sse2_psrli_q_0(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrli_q_0 +; CHECK-NEXT: ret <2 x i64> %v + %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %v, i32 0) + ret <2 x i64> %1 +} + +define <2 x i64> @sse2_psrli_q_15(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrli_q_15 +; CHECK-NEXT: %1 = lshr <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> %1 + %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %v, i32 15) + ret <2 x i64> %1 +} + +define <2 x i64> @sse2_psrli_q_64(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrli_q_64 +; CHECK-NEXT: ret <2 x i64> zeroinitializer + %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %v, i32 64) + ret <2 x i64> %1 +} + +define <16 x i16> @avx2_psrli_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrli_w_0 +; CHECK-NEXT: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %v, i32 0) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrli_w_15(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrli_w_15 +; CHECK-NEXT: %1 = lshr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %v, i32 15) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrli_w_64(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrli_w_64 +; CHECK-NEXT: ret <16 x i16> zeroinitializer + %1 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %v, i32 64) + ret <16 x i16> %1 +} + +define <8 x i32> @avx2_psrli_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrli_d_0 +; CHECK-NEXT: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %v, i32 0) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrli_d_15(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrli_d_15 +; CHECK-NEXT: %1 = lshr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %v, i32 15) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrli_d_64(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrli_d_64 +; CHECK-NEXT: ret <8 x i32> zeroinitializer + %1 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %v, i32 64) + ret <8 x i32> %1 +} + +define <4 x i64> @avx2_psrli_q_0(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrli_q_0 +; CHECK-NEXT: ret <4 x i64> %v + %1 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 0) + ret <4 x i64> %1 +} + +define <4 x 
i64> @avx2_psrli_q_15(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrli_q_15 +; CHECK-NEXT: %1 = lshr <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> %1 + %1 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 15) + ret <4 x i64> %1 +} + +define <4 x i64> @avx2_psrli_q_64(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrli_q_64 +; CHECK-NEXT: ret <4 x i64> zeroinitializer + %1 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 64) + ret <4 x i64> %1 +} + +; +; SHL - Immediate +; + +define <8 x i16> @sse2_pslli_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_pslli_w_0 +; CHECK-NEXT: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %v, i32 0) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_pslli_w_15(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_pslli_w_15 +; CHECK-NEXT: %1 = shl <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %v, i32 15) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_pslli_w_64(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_pslli_w_64 +; CHECK-NEXT: ret <8 x i16> zeroinitializer + %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %v, i32 64) + ret <8 x i16> %1 +} + +define <4 x i32> @sse2_pslli_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_pslli_d_0 +; CHECK-NEXT: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 0) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_pslli_d_15(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_pslli_d_15 +; CHECK-NEXT: %1 = shl <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 15) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_pslli_d_64(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_pslli_d_64 +; CHECK-NEXT: ret <4 x i32> zeroinitializer + %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 64) + ret <4 x i32> %1 +} + +define <2 x i64> @sse2_pslli_q_0(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_pslli_q_0 +; CHECK-NEXT: ret <2 x i64> %v + %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 0) + ret <2 x i64> %1 +} + +define <2 x i64> @sse2_pslli_q_15(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_pslli_q_15 +; CHECK-NEXT: %1 = shl <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> %1 + %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 15) + ret <2 x i64> %1 +} + +define <2 x i64> @sse2_pslli_q_64(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_pslli_q_64 +; CHECK-NEXT: ret <2 x i64> zeroinitializer + %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 64) + ret <2 x i64> %1 +} + +define <16 x i16> @avx2_pslli_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_pslli_w_0 +; CHECK-NEXT: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 0) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_pslli_w_15(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_pslli_w_15 +; CHECK-NEXT: %1 = shl <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 15) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_pslli_w_64(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_pslli_w_64 +; CHECK-NEXT: ret <16 x i16> zeroinitializer + %1 = tail call <16 x i16> 
@llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 64) + ret <16 x i16> %1 +} + +define <8 x i32> @avx2_pslli_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_pslli_d_0 +; CHECK-NEXT: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %v, i32 0) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_pslli_d_15(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_pslli_d_15 +; CHECK-NEXT: %1 = shl <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %v, i32 15) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_pslli_d_64(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_pslli_d_64 +; CHECK-NEXT: ret <8 x i32> zeroinitializer + %1 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %v, i32 64) + ret <8 x i32> %1 +} + +define <4 x i64> @avx2_pslli_q_0(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_pslli_q_0 +; CHECK-NEXT: ret <4 x i64> %v + %1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 0) + ret <4 x i64> %1 +} + +define <4 x i64> @avx2_pslli_q_15(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_pslli_q_15 +; CHECK-NEXT: %1 = shl <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> %1 + %1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 15) + ret <4 x i64> %1 +} + +define <4 x i64> @avx2_pslli_q_64(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_pslli_q_64 +; CHECK-NEXT: ret <4 x i64> zeroinitializer + %1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 64) + ret <4 x i64> %1 +} + +; +; ASHR - Constant Vector +; + +define <8 x i16> @sse2_psra_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_0 +; CHECK-NEXT: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> zeroinitializer) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psra_w_15(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_15 +; CHECK-NEXT: %1 = ashr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psra_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_15_splat +; CHECK-NEXT: %1 = ashr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psra_w_64(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_64 +; CHECK-NEXT: %1 = ashr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <4 x i32> @sse2_psra_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_0 +; CHECK-NEXT: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> zeroinitializer) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psra_d_15(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_15 +; CHECK-NEXT: %1 = ashr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psra_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_15_splat +; CHECK-NEXT: %1 = ashr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define 
<4 x i32> @sse2_psra_d_64(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_64 +; CHECK-NEXT: %1 = ashr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <16 x i16> @avx2_psra_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_0 +; CHECK-NEXT: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> zeroinitializer) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psra_w_15(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_15 +; CHECK-NEXT: %1 = ashr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psra_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_15_splat +; CHECK-NEXT: %1 = ashr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psra_w_64(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_64 +; CHECK-NEXT: %1 = ashr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <8 x i32> @avx2_psra_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_0 +; CHECK-NEXT: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> zeroinitializer) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psra_d_15(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_15 +; CHECK-NEXT: %1 = ashr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psra_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_15_splat +; CHECK-NEXT: %1 = ashr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psra_d_64(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_64 +; CHECK-NEXT: %1 = ashr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +; +; LSHR - Constant Vector +; + +define <8 x i16> @sse2_psrl_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_w_0 +; CHECK-NEXT: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> zeroinitializer) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psrl_w_15(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_w_15 +; CHECK-NEXT: %1 = lshr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psrl_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_w_15_splat +; CHECK-NEXT: ret <8 x i16> zeroinitializer + %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psrl_w_64(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_w_64 +; CHECK-NEXT: ret <8 x i16> zeroinitializer + %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> ) + ret 
<8 x i16> %1 +} + +define <4 x i32> @sse2_psrl_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_d_0 +; CHECK-NEXT: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> zeroinitializer) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrl_d_15(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_d_15 +; CHECK-NEXT: %1 = lshr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrl_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_d_15_splat +; CHECK-NEXT: ret <4 x i32> zeroinitializer + %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrl_d_64(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_d_64 +; CHECK-NEXT: ret <4 x i32> zeroinitializer + %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <2 x i64> @sse2_psrl_q_0(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_q_0 +; CHECK-NEXT: ret <2 x i64> %v + %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> zeroinitializer) + ret <2 x i64> %1 +} + +define <2 x i64> @sse2_psrl_q_15(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_q_15 +; CHECK-NEXT: %1 = lshr <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> %1 + %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> ) + ret <2 x i64> %1 +} + +define <2 x i64> @sse2_psrl_q_64(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_q_64 +; CHECK-NEXT: ret <2 x i64> zeroinitializer + %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> ) + ret <2 x i64> %1 +} + +define <16 x i16> @avx2_psrl_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_w_0 +; CHECK-NEXT: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> zeroinitializer) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrl_w_15(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_w_15 +; CHECK-NEXT: %1 = lshr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrl_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_w_15_splat +; CHECK-NEXT: ret <16 x i16> zeroinitializer + %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrl_w_64(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_w_64 +; CHECK-NEXT: ret <16 x i16> zeroinitializer + %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <8 x i32> @avx2_psrl_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_d_0 +; CHECK-NEXT: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> zeroinitializer) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrl_d_15(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_d_15 +; CHECK-NEXT: %1 = lshr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrl_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: 
@avx2_psrl_d_15_splat +; CHECK-NEXT: ret <8 x i32> zeroinitializer + %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrl_d_64(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_d_64 +; CHECK-NEXT: ret <8 x i32> zeroinitializer + %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <4 x i64> @avx2_psrl_q_0(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_q_0 +; CHECK-NEXT: ret <4 x i64> %v + %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> zeroinitializer) + ret <4 x i64> %1 +} + +define <4 x i64> @avx2_psrl_q_15(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_q_15 +; CHECK-NEXT: %1 = lshr <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> %1 + %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> ) + ret <4 x i64> %1 +} + +define <4 x i64> @avx2_psrl_q_64(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_q_64 +; CHECK-NEXT: ret <4 x i64> zeroinitializer + %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> ) + ret <4 x i64> %1 +} + +; +; SHL - Constant Vector +; + +define <8 x i16> @sse2_psll_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_w_0 +; CHECK-NEXT: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> zeroinitializer) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psll_w_15(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_w_15 +; CHECK-NEXT: %1 = shl <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psll_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_w_15_splat +; CHECK-NEXT: ret <8 x i16> zeroinitializer + %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psll_w_64(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_w_64 +; CHECK-NEXT: ret <8 x i16> zeroinitializer + %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <4 x i32> @sse2_psll_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_d_0 +; CHECK-NEXT: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> zeroinitializer) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psll_d_15(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_d_15 +; CHECK-NEXT: %1 = shl <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psll_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_d_15_splat +; CHECK-NEXT: ret <4 x i32> zeroinitializer + %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psll_d_64(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_d_64 +; CHECK-NEXT: ret <4 x i32> zeroinitializer + %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <2 x i64> @sse2_psll_q_0(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_q_0 +; CHECK-NEXT: ret <2 x i64> %v + %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> zeroinitializer) + 
ret <2 x i64> %1 +} + +define <2 x i64> @sse2_psll_q_15(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_q_15 +; CHECK-NEXT: %1 = shl <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> %1 + %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> ) + ret <2 x i64> %1 +} + +define <2 x i64> @sse2_psll_q_64(<2 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_q_64 +; CHECK-NEXT: ret <2 x i64> zeroinitializer + %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> ) + ret <2 x i64> %1 +} + +define <16 x i16> @avx2_psll_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_w_0 +; CHECK-NEXT: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> zeroinitializer) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psll_w_15(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_w_15 +; CHECK-NEXT: %1 = shl <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psll_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_w_15_splat +; CHECK-NEXT: ret <16 x i16> zeroinitializer + %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psll_w_64(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_w_64 +; CHECK-NEXT: ret <16 x i16> zeroinitializer + %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <8 x i32> @avx2_psll_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_d_0 +; CHECK-NEXT: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> zeroinitializer) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psll_d_15(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_d_15 +; CHECK-NEXT: %1 = shl <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psll_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_d_15_splat +; CHECK-NEXT: ret <8 x i32> zeroinitializer + %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psll_d_64(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_d_64 +; CHECK-NEXT: ret <8 x i32> zeroinitializer + %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <4 x i64> @avx2_psll_q_0(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_q_0 +; CHECK-NEXT: ret <4 x i64> %v + %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> zeroinitializer) + ret <4 x i64> %1 +} + +define <4 x i64> @avx2_psll_q_15(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_q_15 +; CHECK-NEXT: %1 = shl <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> %1 + %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> ) + ret <4 x i64> %1 +} + +define <4 x i64> @avx2_psll_q_64(<4 x i64> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_q_64 +; CHECK-NEXT: ret <4 x i64> zeroinitializer + %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> ) + ret <4 x i64> %1 +} + +; +; Vector Demanded Bits +; + +define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) nounwind 
readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_var +; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %a) +; CHECK-NEXT: ret <8 x i16> %1 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1) + ret <8 x i16> %2 +} + +define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_var +; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a) +; CHECK-NEXT: ret <4 x i32> %1 + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1) + ret <4 x i32> %2 +} + +define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_var +; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a) +; CHECK-NEXT: ret <16 x i16> %1 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1) + ret <16 x i16> %2 +} + +define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_var +; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %a) +; CHECK-NEXT: ret <8 x i32> %1 + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1) + ret <8 x i32> %2 +} + +define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_w_var +; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a) +; CHECK-NEXT: ret <8 x i16> %1 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + %2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1) + ret <8 x i16> %2 +} + +define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_d_var +; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %a) +; CHECK-NEXT: ret <4 x i32> %1 + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1) + ret <4 x i32> %2 +} + +define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrl_q_var +; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %a) +; CHECK-NEXT: ret <2 x i64> %1 + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1) + ret <2 x i64> %2 +} + +define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_w_var +; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %a) +; CHECK-NEXT: ret <16 x i16> %1 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + %2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1) + ret <16 x i16> %2 +} + +define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_d_var +; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a) +; CHECK-NEXT: ret <8 x i32> %1 + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + %2 = tail call <8 x i32> 
@llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1) + ret <8 x i32> %2 +} + +define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrl_q_var +; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a) +; CHECK-NEXT: ret <4 x i64> %1 + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1) + ret <4 x i64> %2 +} + +define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_w_var +; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a) +; CHECK-NEXT: ret <8 x i16> %1 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + %2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1) + ret <8 x i16> %2 +} + +define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_d_var +; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %a) +; CHECK-NEXT: ret <4 x i32> %1 + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + %2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1) + ret <4 x i32> %2 +} + +define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psll_q_var +; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %a) +; CHECK-NEXT: ret <2 x i64> %1 + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1) + ret <2 x i64> %2 +} + +define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_w_var +; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %a) +; CHECK-NEXT: ret <16 x i16> %1 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1) + ret <16 x i16> %2 +} + +define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_d_var +; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %a) +; CHECK-NEXT: ret <8 x i32> %1 + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + %2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1) + ret <8 x i32> %2 +} + +define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psll_q_var +; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %a) +; CHECK-NEXT: ret <4 x i64> %1 + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1) + ret <4 x i64> %2 +} + +; +; Constant Folding +; + +define <8 x i16> @test_sse2_psra_w_0(<8 x i16> %A) { +; CHECK-LABEL: @test_sse2_psra_w_0 +; CHECK-NEXT: ret <8 x i16> %A + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0) + %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> ) + %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0) + ret <8 x i16> %3 +} + +define <8 x i16> @test_sse2_psra_w_8() { +; CHECK-LABEL: @test_sse2_psra_w_8 +; CHECK-NEXT: ret <8 x i16> + %1 = bitcast <2 x i64> to <8 x i16> + %2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %1, i32 3) + 
%3 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %2, <8 x i16> ) + %4 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %3, i32 2) + ret <8 x i16> %4 +} + +define <4 x i32> @test_sse2_psra_d_0(<4 x i32> %A) { +; CHECK-LABEL: @test_sse2_psra_d_0 +; CHECK-NEXT: ret <4 x i32> %A + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0) + %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> ) + %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 0) + ret <4 x i32> %3 +} + +define <4 x i32> @sse2_psra_d_8() { +; CHECK-LABEL: @sse2_psra_d_8 +; CHECK-NEXT: ret <4 x i32> + %1 = bitcast <2 x i64> to <4 x i32> + %2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 3) + %3 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %2, <4 x i32> ) + %4 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %3, i32 2) + ret <4 x i32> %4 +} + +define <16 x i16> @test_avx2_psra_w_0(<16 x i16> %A) { +; CHECK-LABEL: @test_avx2_psra_w_0 +; CHECK-NEXT: ret <16 x i16> %A + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0) + %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> ) + %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0) + ret <16 x i16> %3 +} + +define <16 x i16> @test_avx2_psra_w_8(<16 x i16> %A) { +; CHECK-LABEL: @test_avx2_psra_w_8 +; CHECK-NEXT: ret <16 x i16> + %1 = bitcast <4 x i64> to <16 x i16> + %2 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %1, i32 3) + %3 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %2, <8 x i16> ) + %4 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %3, i32 2) + ret <16 x i16> %4 +} + +define <8 x i32> @test_avx2_psra_d_0(<8 x i32> %A) { +; CHECK-LABEL: @test_avx2_psra_d_0 +; CHECK-NEXT: ret <8 x i32> %A + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0) + %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> ) + %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0) + ret <8 x i32> %3 +} + +define <8 x i32> @test_avx2_psra_d_8() { +; CHECK-LABEL: @test_avx2_psra_d_8 +; CHECK-NEXT: ret <8 x i32> + %1 = bitcast <4 x i64> to <8 x i32> + %2 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %1, i32 3) + %3 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %2, <4 x i32> ) + %4 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %3, i32 2) + ret <8 x i32> %4 +} + +define <2 x i64> @test_sse2_1() nounwind readnone uwtable { + %S = bitcast i32 1 to i32 + %1 = zext i32 %S to i64 + %2 = insertelement <2 x i64> undef, i64 %1, i32 0 + %3 = insertelement <2 x i64> %2, i64 0, i32 1 + %4 = bitcast <2 x i64> %3 to <8 x i16> + %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> , <8 x i16> %4) + %6 = bitcast <8 x i16> %5 to <4 x i32> + %7 = bitcast <2 x i64> %3 to <4 x i32> + %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7) + %9 = bitcast <4 x i32> %8 to <2 x i64> + %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3) + %11 = bitcast <2 x i64> %10 to <8 x i16> + %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S) + %13 = bitcast <8 x i16> %12 to <4 x i32> + %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S) + %15 = bitcast <4 x i32> %14 to <2 x i64> + %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S) + ret <2 x i64> %16 +; CHECK: test_sse2_1 +; CHECK: ret <2 x i64> +} + +define <4 x i64> @test_avx2_1() nounwind readnone uwtable { + %S = bitcast i32 1 to 
i32 + %1 = zext i32 %S to i64 + %2 = insertelement <2 x i64> undef, i64 %1, i32 0 + %3 = insertelement <2 x i64> %2, i64 0, i32 1 + %4 = bitcast <2 x i64> %3 to <8 x i16> + %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> , <8 x i16> %4) + %6 = bitcast <16 x i16> %5 to <8 x i32> + %7 = bitcast <2 x i64> %3 to <4 x i32> + %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7) + %9 = bitcast <8 x i32> %8 to <4 x i64> + %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3) + %11 = bitcast <4 x i64> %10 to <16 x i16> + %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S) + %13 = bitcast <16 x i16> %12 to <8 x i32> + %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S) + %15 = bitcast <8 x i32> %14 to <4 x i64> + %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S) + ret <4 x i64> %16 +; CHECK: test_avx2_1 +; CHECK: ret <4 x i64> +} + +define <2 x i64> @test_sse2_0() nounwind readnone uwtable { + %S = bitcast i32 128 to i32 + %1 = zext i32 %S to i64 + %2 = insertelement <2 x i64> undef, i64 %1, i32 0 + %3 = insertelement <2 x i64> %2, i64 0, i32 1 + %4 = bitcast <2 x i64> %3 to <8 x i16> + %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> , <8 x i16> %4) + %6 = bitcast <8 x i16> %5 to <4 x i32> + %7 = bitcast <2 x i64> %3 to <4 x i32> + %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7) + %9 = bitcast <4 x i32> %8 to <2 x i64> + %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3) + %11 = bitcast <2 x i64> %10 to <8 x i16> + %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S) + %13 = bitcast <8 x i16> %12 to <4 x i32> + %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S) + %15 = bitcast <4 x i32> %14 to <2 x i64> + %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S) + ret <2 x i64> %16 +; CHECK: test_sse2_0 +; CHECK: ret <2 x i64> zeroinitializer +} + +define <4 x i64> @test_avx2_0() nounwind readnone uwtable { + %S = bitcast i32 128 to i32 + %1 = zext i32 %S to i64 + %2 = insertelement <2 x i64> undef, i64 %1, i32 0 + %3 = insertelement <2 x i64> %2, i64 0, i32 1 + %4 = bitcast <2 x i64> %3 to <8 x i16> + %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> , <8 x i16> %4) + %6 = bitcast <16 x i16> %5 to <8 x i32> + %7 = bitcast <2 x i64> %3 to <4 x i32> + %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7) + %9 = bitcast <8 x i32> %8 to <4 x i64> + %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3) + %11 = bitcast <4 x i64> %10 to <16 x i16> + %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S) + %13 = bitcast <16 x i16> %12 to <8 x i32> + %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S) + %15 = bitcast <8 x i32> %14 to <4 x i64> + %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S) + ret <4 x i64> %16 +; CHECK: test_avx2_0 +; CHECK: ret <4 x i64> zeroinitializer +} +define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable { + %S = bitcast i32 1 to i32 + %1 = zext i32 %S to i64 + %2 = insertelement <2 x i64> undef, i64 %1, i32 0 + %3 = insertelement <2 x i64> %2, i64 0, i32 1 + %4 = bitcast <2 x i64> %3 to <8 x i16> + %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> , <8 x i16> %4) + %6 = bitcast <8 x i16> %5 to <4 x i32> + %7 = bitcast <2 x i64> %3 to <4 x i32> + %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7) + 
%9 = bitcast <4 x i32> %8 to <2 x i64>
+ %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <2 x i64> %10 to <8 x i16>
+ %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
+ %13 = bitcast <8 x i16> %12 to <4 x i32>
+ %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
+ %15 = bitcast <4 x i32> %14 to <2 x i64>
+ %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
+ ret <2 x i64> %16
+; CHECK: test_sse2_psrl_1
+; CHECK: ret <2 x i64>
+}
+
+define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
+ %S = bitcast i32 1 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> , <8 x i16> %4)
+ %6 = bitcast <16 x i16> %5 to <8 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <8 x i32> %8 to <4 x i64>
+ %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <4 x i64> %10 to <16 x i16>
+ %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
+ %13 = bitcast <16 x i16> %12 to <8 x i32>
+ %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
+ %15 = bitcast <8 x i32> %14 to <4 x i64>
+ %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
+ ret <4 x i64> %16
+; CHECK: test_avx2_psrl_1
+; CHECK: ret <4 x i64>
+}
+
+define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
+ %S = bitcast i32 128 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> , <8 x i16> %4)
+ %6 = bitcast <8 x i16> %5 to <4 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <4 x i32> %8 to <2 x i64>
+ %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <2 x i64> %10 to <8 x i16>
+ %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
+ %13 = bitcast <8 x i16> %12 to <4 x i32>
+ %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
+ %15 = bitcast <4 x i32> %14 to <2 x i64>
+ %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
+ ret <2 x i64> %16
+; CHECK: test_sse2_psrl_0
+; CHECK: ret <2 x i64> zeroinitializer
+}
+
+define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
+ %S = bitcast i32 128 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> , <8 x i16> %4)
+ %6 = bitcast <16 x i16> %5 to <8 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <8 x i32> %8 to <4 x i64>
+ %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <4 x i64> %10 to <16 x i16>
+ %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
+ %13 = bitcast <16 x i16> %12 to <8 x i32>
+ %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
+ %15 = bitcast <8 x i32> %14 to <4 x i64>
+ %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
+ ret <4 x i64> %16
+; CHECK: test_avx2_psrl_0
+; CHECK: ret <4 x i64> zeroinitializer
+}
+
+declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
+declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
+declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
+declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
+declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
+declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
+
+declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
+declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
+declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
+declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
+declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
+declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1
+
+declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) #1
+declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) #1
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) #1
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) #1
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) #1
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) #1
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) #1
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) #1
+
+attributes #1 = { nounwind readnone }
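
For reference, a minimal standalone sketch (not part of the patch itself) of the two kinds of shift-by-immediate fold the tests above check, written against the same SSE2 intrinsic declaration the test file uses. The splatted shift amount in the comment is an assumption about the folded form; the CHECK lines above only pin down the lshr/ashr/shl opcode and the zeroinitializer result. The example_* function names are illustrative only.

declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)

; In-range immediate: expected to fold to a plain IR shift, e.g.
;   %r = lshr <4 x i32> %v, <i32 15, i32 15, i32 15, i32 15>
define <4 x i32> @example_psrli_d_15(<4 x i32> %v) {
  %r = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 15)
  ret <4 x i32> %r
}

; Out-of-range immediate on a logical shift (64 >= the 32-bit element width):
; expected to fold to ret <4 x i32> zeroinitializer.
define <4 x i32> @example_psrli_d_64(<4 x i32> %v) {
  %r = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 64)
  ret <4 x i32> %r
}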
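
Similarly, a hedged sketch of the "Vector Demanded Bits" pattern exercised by the *_var tests above: the psll/psrl/psra count operand only contributes its low 64 bits, so a shuffle of the count vector that leaves those low lanes in place should be bypassed, leaving the intrinsic call to use the original count vector directly, which is what the CHECK-NEXT lines for those tests expect. The shuffle mask below is an assumed example (the masks in the tests themselves are not reproduced here), chosen so that lanes 0 and 1 are unchanged.

declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>)

define <4 x i32> @example_psrl_d_var(<4 x i32> %v, <4 x i32> %a) {
  ; only the low 64 bits (lanes 0 and 1) of %s are read as the shift count,
  ; so the shuffle is expected to be dropped and %a used directly
  %s = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  %r = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %s)
  ret <4 x i32> %r
}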