diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index 0019a57627c..35ba09313d6 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -1,8 +1,9 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define i16 @test1(float %f) {
 entry:
-; CHECK: @test1
+; CHECK-LABEL: @test1(
 ; CHECK: fmul float
 ; CHECK-NOT: insertelement {{.*}} 0.00
 ; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul
@@ -22,7 +23,7 @@ entry:
 }
 
 define i32 @test2(float %f) {
-; CHECK: @test2
+; CHECK-LABEL: @test2(
 ; CHECK-NOT: insertelement
 ; CHECK-NOT: extractelement
 ; CHECK: ret
@@ -37,7 +38,7 @@ define i32 @test2(float %f) {
 }
 
 define i64 @test3(float %f, double %d) {
-; CHECK: @test3
+; CHECK-LABEL: @test3(
 ; CHECK-NOT: insertelement {{.*}} 0.00
 ; CHECK: ret
 entry:
@@ -85,7 +86,7 @@ entry:
 }
 
 define void @get_image() nounwind {
-; CHECK: @get_image
+; CHECK-LABEL: @get_image(
 ; CHECK-NOT: extractelement
 ; CHECK: unreachable
 entry:
@@ -105,7 +106,7 @@ bb3:		; preds = %bb2, %entry
 
 ; PR4340
 define void @vac(<4 x float>* nocapture %a) nounwind {
-; CHECK: @vac
+; CHECK-LABEL: @vac(
 ; CHECK-NOT: load
 ; CHECK: ret
 entry:
@@ -155,7 +156,7 @@ declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
 
 define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
 entry:
-; CHECK: define <4 x float> @dead_shuffle_elt
+; CHECK-LABEL: define <4 x float> @dead_shuffle_elt(
 ; CHECK: shufflevector <2 x float> %y, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %shuffle.i = shufflevector <2 x float> %y, <2 x float> %y, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %shuffle9.i = shufflevector <4 x float> %x, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -163,7 +164,7 @@ entry:
 }
 
 define <2 x float> @test_fptrunc(double %f) {
-; CHECK: @test_fptrunc
+; CHECK-LABEL: @test_fptrunc(
 ; CHECK: insertelement
 ; CHECK: insertelement
 ; CHECK-NOT: insertelement
@@ -177,7 +178,7 @@ define <2 x float> @test_fptrunc(double %f) {
 }
 
 define <2 x double> @test_fpext(float %f) {
-; CHECK: @test_fpext
+; CHECK-LABEL: @test_fpext(
 ; CHECK: insertelement
 ; CHECK: insertelement
 ; CHECK-NOT: insertelement
@@ -191,7 +192,7 @@ define <2 x double> @test_fpext(float %f) {
 }
 
 define <4 x float> @test_select(float %f, float %g) {
-; CHECK: @test_select
+; CHECK-LABEL: @test_select(
 ; CHECK: %a0 = insertelement <4 x float> undef, float %f, i32 0
 ; CHECK-NOT: insertelement
 ; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
@@ -209,4 +210,240 @@ define <4 x float> @test_select(float %f, float %g) {
   ret <4 x float> %ret
 }
 
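+; A sketch of the insertqi semantics the tests below rely on (LEN and IDX
+; here are illustrative names, not values taken from any one test):
+;   insertqi(%v, %i, LEN, IDX) yields %v with bits [IDX, IDX+LEN) of its
+;   low 64-bit element replaced by bits [0, LEN) of %i's low element; the
+;   high 64-bit element of the result is undefined.
+; Two calls with the same LEN/IDX therefore write the exact same bit field,
+; which is why the pair in @testInsertTwice below folds to a single call.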
+; We should optimize these two redundant insertqi calls into one
+; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
+; CHECK-NOT: insertqi
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
+  ret <2 x i64> %2
+}
+
+; The result of this insert is the second arg, since the top 64 bits of
+; the result are undefined and the bottom 64 bits all come from the
+; second arg.
+; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> %i
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
+  ret <2 x i64> %1
+}
+
+; Test the various range overlaps and orderings that can occur between two
+; insertqi calls.
+; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+  ret <2 x i64> %2
+}
+
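+; Worked out in the [IDX, IDX+LEN) form sketched above: the contained pairs
+; write [0,32) then [16,32), the overlapping pairs [0,32) then [16,48), and
+; the adjacent pairs [0,32) then [32,48); in each case the CHECK lines
+; expect a single insertqi covering the union of the two ranges. The
+; disjoint pairs below write [0,16) and [32,48), leaving a gap at [16,32)
+; that one insertqi cannot express, so both calls must remain.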
+; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+  ret <2 x i64> %2
+}
+
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
+
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
+define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps(
+; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+  ret <4 x float> %a
+}
+
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
+define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps_256(
+; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x float> %a
+}
+
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i32>)
+define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd(
+; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i32> <i32 2, i32 0>)
+  ret <2 x double> %a
+}
+
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i32>)
+define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd_256(
+; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i32> <i32 3, i32 1, i32 2, i32 0>)
+  ret <4 x double> %a
+}
+
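+; The vpermilvar folds above depend on the control vector being constant:
+; each control element selects a source element within its own 128-bit
+; lane (the low two bits per element for ps, bit 1 for pd), so a constant
+; control is equivalent to a fixed shufflevector mask. For example, in
+; @test_vpermilvar_pd the control <2, 0> has bit 1 set then clear, which
+; selects element 1 then element 0, i.e. the <1, 0> mask checked above.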
+define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
+  %S = bitcast i32 1 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <2 x i64>
+  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <2 x i64> %10 to <8 x i16>
+  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
+  %13 = bitcast <8 x i16> %12 to <4 x i32>
+  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
+  %15 = bitcast <4 x i32> %14 to <2 x i64>
+  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
+  ret <2 x i64> %16
+
+; CHECK: test_sse2_1
+; CHECK: ret <2 x i64>
+}
+
+define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
+  %S = bitcast i32 1 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <4 x i64>
+  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <4 x i64> %10 to <16 x i16>
+  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
+  %13 = bitcast <16 x i16> %12 to <8 x i32>
+  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
+  %15 = bitcast <8 x i32> %14 to <4 x i64>
+  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
+  ret <4 x i64> %16
+; CHECK: test_avx2_1
+; CHECK: ret <4 x i64>
+}
+
+define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
+  %S = bitcast i32 128 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %4)
+  %6 = bitcast <8 x i16> %5 to <4 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <4 x i32> %8 to <2 x i64>
+  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <2 x i64> %10 to <8 x i16>
+  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
+  %13 = bitcast <8 x i16> %12 to <4 x i32>
+  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
+  %15 = bitcast <4 x i32> %14 to <2 x i64>
+  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
+  ret <2 x i64> %16
+
+; CHECK: test_sse2_0
+; CHECK: ret <2 x i64> zeroinitializer
+}
+
+define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
+  %S = bitcast i32 128 to i32
+  %1 = zext i32 %S to i64
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 0, i32 1
+  %4 = bitcast <2 x i64> %3 to <8 x i16>
+  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %4)
+  %6 = bitcast <16 x i16> %5 to <8 x i32>
+  %7 = bitcast <2 x i64> %3 to <4 x i32>
+  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
+  %9 = bitcast <8 x i32> %8 to <4 x i64>
+  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
+  %11 = bitcast <4 x i64> %10 to <16 x i16>
+  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
+  %13 = bitcast <16 x i16> %12 to <8 x i32>
+  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
+  %15 = bitcast <8 x i32> %14 to <4 x i64>
+  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
+  ret <4 x i64> %16
+; CHECK: test_avx2_0
+; CHECK: ret <4 x i64> zeroinitializer
+}
+
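+; In the four shift tests above, the psll intrinsics read their shift
+; amount from the low 64 bits of the count vector, and the pslli forms
+; take an i32 immediate, so with a constant %S the whole chain is a
+; sequence of constant left shifts. With %S = 128 every lane is shifted
+; past its bit width and the chain folds to zeroinitializer, as the CHECK
+; lines for @test_sse2_0 and @test_avx2_0 expect; with %S = 1 the tests
+; only check that a value of the right vector type is returned.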
+declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
+declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
+declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
+declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
+declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
+declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
+
+attributes #1 = { nounwind readnone }