From: Cong Hou
Date: Tue, 24 Nov 2015 19:51:26 +0000 (+0000)
Subject: [X86] Fix several issues related to X86's psadbw instruction.
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=5c1d0fd2040c07b467e486fc6d2aeb4da6dab12e

[X86] Fix several issues related to X86's psadbw instruction.

This patch fixes the following issues:

1. Fix the return type of X86psadbw: it should not be the same type as its
   inputs. For vNi8 inputs the output should be vMi64, where M = N/8.
2. Fix the return type of int_x86_avx512_psad_bw_512 accordingly.
3. Fix the definitions of PSADBW, VPSADBW, and VPSADBWY accordingly.
4. Adjust the return type when building a DAG node of type X86ISD::PSADBW.
5. Update related tests.

Differential revision: http://reviews.llvm.org/D14897

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254010 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 84cd35dd156..a07c4c46abc 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -5448,7 +5448,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_psad_bw_512 : GCCBuiltin<"__builtin_ia32_psadbw512">,
-          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
+          Intrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
           [IntrNoMem]>;
 }

 // FP logical ops
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e06e6fbcf52..59e16d0a094 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19316,7 +19316,8 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
   // chunks, thus directly computes the pop count for v2i64 and v4i64.
   if (EltVT == MVT::i64) {
     SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
-    V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros);
+    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
     return DAG.getBitcast(VT, V);
   }

@@ -19332,9 +19333,10 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,

   // Do the horizontal sums into two v2i64s.
   Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
-  Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+  MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+  Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                     DAG.getBitcast(ByteVecVT, Low), Zeros);
-  High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+  High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, High), Zeros);

   // Merge them together.
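As background for the hunks above and below: a psadbw of vNi8 operands produces
N/8 sums of absolute differences, one per group of eight bytes, each held in an
i64 lane. The 128-bit intrinsic already declares this vNi8 -> vMi64 contract; a
minimal LLVM IR sketch, illustrative only and not part of the patch (the
function name @sad128 is made up for the example):

    declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>)

    define <2 x i64> @sad128(<16 x i8> %a, <16 x i8> %b) {
      ; Each i64 lane receives the sum of absolute differences of the
      ; corresponding eight byte lanes of %a and %b (N = 16, M = N/8 = 2).
      %sad = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a, <16 x i8> %b)
      ret <2 x i64> %sad
    }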
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index f5d02ef3608..8411ce6a2c0 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -7369,32 +7369,34 @@ defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",

 multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
-                                string OpcodeStr, X86VectorVTInfo _src>{
+                                string OpcodeStr, X86VectorVTInfo _dst,
+                                X86VectorVTInfo _src>{
   def rr : AVX512BI<opc, MRMSrcReg,
                     (outs _src.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set _src.RC:$dst,(_src.VT (OpNode _src.RC:$src1,
-                                                        _src.RC:$src2)))]>;
+                    [(set _dst.RC:$dst,(_dst.VT
+                                        (OpNode (_src.VT _src.RC:$src1),
+                                                (_src.VT _src.RC:$src2))))]>;
   let mayLoad = 1 in
   def rm : AVX512BI<opc, MRMSrcMem,
                     (outs _src.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     []>;
 }

 multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
                                     string OpcodeStr, Predicate prd> {
   let Predicates = [prd] in
-    defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v64i8_info>,
-                EVEX_V512;
+    defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info,
+                                     v64i8_info>, EVEX_V512;
   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v32i8x_info>,
-                EVEX_V256;
-    defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v16i8x_info>,
-                EVEX_V128;
+    defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info,
+                                     v32i8x_info>, EVEX_V256;
+    defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info,
+                                     v16i8x_info>, EVEX_V128;
   }
 }
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 68891fa18db..1a0a19ceadd 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -79,8 +79,8 @@ def X86pshufb : SDNode<"X86ISD::PSHUFB",
                 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                      SDTCisSameAs<0,2>]>>;
 def X86psadbw : SDNode<"X86ISD::PSADBW",
-                SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-                                     SDTCisSameAs<0,2>]>>;
+                SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                     SDTCisSameAs<1,2>]>>;
 def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW",
                   SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
                                        SDTCisSameAs<1,2>, SDTCisInt<3>]>>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 14d40a7700d..a93240bd717 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4066,22 +4066,18 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
                                  int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
 defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                  int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
-defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
-                                int_x86_avx2_psad_bw, SSE_PMADD, 1>;
-
-let Predicates = [HasAVX2] in
-  def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1),
-                              (v32i8 VR256:$src2))),
-            (VPSADBWYrr VR256:$src2, VR256:$src1)>;

 let Predicates = [HasAVX] in
-  def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1),
-                              (v16i8 VR128:$src2))),
-            (VPSADBWrr VR128:$src2, VR128:$src1)>;
-
-def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1),
-                            (v16i8 VR128:$src2))),
-          (PSADBWrr VR128:$src2, VR128:$src1)>;
+defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
+                             loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+               VEX_4V;
+let Predicates = [HasAVX2] in
+defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
+                              loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
+                VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
+                            memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>;

 let Predicates = [HasAVX] in
 defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
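The PDI_binop_rm2 definitions above encode the distinct result/operand types
(v2i64/v16i8 for VPSADBW, v4i64/v32i8 for VPSADBWY) directly in the instruction
patterns, so the hand-written swapped-operand Pat<> entries are no longer
needed. An illustrative IR use of the AVX2 variant under this typing; a sketch,
not part of the patch (the declaration is the existing upstream signature, the
function name @sad256 is made up):

    declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>)

    define <4 x i64> @sad256(<32 x i8> %a, <32 x i8> %b) {
      ; Four i64 lanes, one SAD per eight-byte group of the 256-bit inputs.
      ; With the X86IntrinsicsInfo.h entries below, this call lowers straight
      ; to an X86ISD::PSADBW node and selects vpsadbw (VPSADBWY).
      %sad = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a, <32 x i8> %b)
      ret <4 x i64> %sad
    }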
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 0b77d480bc0..4784d9c3758 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -284,6 +284,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
   X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
   X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+  X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
   X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
   X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
   X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0),
@@ -1710,6 +1711,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
   X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
   X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+  X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
   X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0),
   X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0),
   X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0),
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 9e702be1ef4..7b9768c9594 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1255,15 +1255,15 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) {
   %res2 = add <8 x i64> %res, %res1
   ret <8 x i64> %res2
 }

-declare <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
+declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
 ; CHECK-LABEL: @test_int_x86_avx512_mask_psadb_w_512
 ; CHECK-NOT: call
 ; CHECK: vpsadbw %zmm1
 ; CHECK: vpsadbw %zmm2
-define <64 x i8>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
-  %res = call <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
-  %res1 = call <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
-  %res2 = add <64 x i8> %res, %res1
-  ret <64 x i8> %res2
+define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
+  %res = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
+  %res1 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
+  %res2 = add <8 x i64> %res, %res1
+  ret <8 x i64> %res2
 }
diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll
index 2ef88ef3fd6..358bd401829 100644
--- a/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/test/CodeGen/X86/vector-popcnt-128.ll
@@ -92,7 +92,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
   ret <2 x i64> %out
@@ -207,9 +207,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
+; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll
index c05744df277..b0e39bdf49f 100644
--- a/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/test/CodeGen/X86/vector-popcnt-256.ll
@@ -15,14 +15,14 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5
 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -37,7 +37,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in)
   ret <4 x i64> %out
@@ -57,9 +57,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsadbw %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5
 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
@@ -68,9 +68,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -87,9 +87,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in)
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index 0fabfc5b2d2..f1714d4845d 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -275,9 +275,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
@@ -297,9 +297,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
 ; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: retq
   %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
@@ -430,9 +430,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
@@ -452,9 +452,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0
 ; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: retq
   %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index a469a1670d7..a9ee27cc51f 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -22,7 +22,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
@@ -30,7 +30,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsadbw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -49,7 +49,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
   ret <4 x i64> %out
@@ -75,7 +75,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
@@ -83,7 +83,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsadbw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -102,7 +102,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
   ret <4 x i64> %out
@@ -129,9 +129,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
 ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-NEXT: vpsadbw %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
@@ -141,9 +141,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -164,9 +164,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
@@ -194,9 +194,9 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
 ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-NEXT: vpsadbw %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
@@ -206,9 +206,9 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -229,9 +229,9 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)