From: Ahmed Bougacha
Date: Thu, 20 Aug 2015 21:02:39 +0000 (+0000)
Subject: [X86] Look for scalar through one bitcast when lowering to VBROADCAST.
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=14fea5bf0df997aff940953a8d35181d6e5c88ba

[X86] Look for scalar through one bitcast when lowering to VBROADCAST.

Fixes PR23464: one way to use the broadcast intrinsics is:

  _mm256_broadcastw_epi16(_mm_cvtsi32_si128(*(int*)src));

We don't currently fold this, but now that we use native IR for the
intrinsics (r245605), we can look through one bitcast to find the
broadcast scalar.

Differential Revision: http://reviews.llvm.org/D10557

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245613 91177308-0d34-0410-b5e6-96231b3b80d8
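For reference, a minimal standalone sketch of the affected pattern (the
function name, file name, build flags, and memcpy-based load are
illustrative only, not part of this patch; the patch itself is the lowering
change and the new TableGen patterns below):

  // broadcastw_repro.cpp -- hypothetical repro for PR23464.
  // Build with, e.g.: clang++ -O2 -mavx2 -S broadcastw_repro.cpp
  #include <immintrin.h>
  #include <cstring>

  __m256i broadcast_low_word(const int *src) {
    int scalar;
    std::memcpy(&scalar, src, sizeof scalar); // the *(int*)src load, minus the UB
    __m128i v = _mm_cvtsi32_si128(scalar);    // becomes a scalar_to_vector of i32
    return _mm256_broadcastw_epi16(v);        // broadcast of i16 element 0, one
                                              // bitcast away from the i32 scalar
  }

  // With this change, the AVX2 codegen should fold to a single
  // load-and-broadcast (as the updated tests below check):
  //   vpbroadcastw (%rdi), %ymm0
  //   retq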
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ed347f6f9ce..30ace06f44b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7553,6 +7553,23 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
 
   // Check if this is a broadcast of a scalar. We special case lowering
   // for scalars so that we can more effectively fold with loads.
+  // First, look through bitcast: if the original value has a larger element
+  // type than the shuffle, the broadcast element is in essence truncated.
+  // Make that explicit to ease folding.
+  if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) {
+    EVT EltVT = VT.getVectorElementType();
+    SDValue V0 = V.getOperand(0);
+    EVT V0VT = V0.getValueType();
+
+    if (V0VT.isInteger() && V0VT.getVectorElementType().bitsGT(EltVT) &&
+        ((V0.getOpcode() == ISD::BUILD_VECTOR ||
+          (V0.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)))) {
+      V = DAG.getNode(ISD::TRUNCATE, DL, EltVT, V0.getOperand(BroadcastIdx));
+      BroadcastIdx = 0;
+    }
+  }
+
+  // Also check the simpler case, where we can directly reuse the scalar.
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
     V = V.getOperand(BroadcastIdx);
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f186738a3a9..3e072bc4bbc 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -8338,6 +8338,13 @@ defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, v4i32
 defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
                                    v2i64, v4i64>;
 let Predicates = [HasAVX2] in {
+  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+  // This means we'll encounter truncated i32 loads; match that here.
+  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+            (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+            (VPBROADCASTWYrm addr:$src)>;
+
   // Provide aliases for broadcast from the same register class that
   // automatically does the extract.
   def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 06740f54943..a8ada6fbfa7 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1404,8 +1404,7 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
 ;
 ; AVX2-LABEL: insert_dup_mem_v16i8_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %ptr, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -1451,9 +1450,7 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
 ;
 ; AVX2-LABEL: insert_dup_mem_v16i8_sext_i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    movsbl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
 ; AVX2-NEXT:    retq
   %tmp = load i8, i8* %ptr, align 1
   %tmp1 = sext i8 %tmp to i32
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index bf4afee1875..f040a691938 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2175,8 +2175,7 @@ define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) {
 ;
 ; AVX2-LABEL: insert_dup_mem_v8i16_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %ptr, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 8904f826c6e..37ed743ea1d 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3301,8 +3301,7 @@ define <16 x i16> @insert_dup_mem_v16i16_i32(i32* %ptr) {
 ;
 ; AVX2-LABEL: insert_dup_mem_v16i16_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %ptr, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index f49b0e18703..b9686dbbf64 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1986,8 +1986,7 @@ define <32 x i8> @insert_dup_mem_v32i8_i32(i32* %ptr) {
 ;
 ; AVX2-LABEL: insert_dup_mem_v32i8_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %ptr, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -2008,9 +2007,7 @@ define <32 x i8> @insert_dup_mem_v32i8_sext_i8(i8* %ptr) {
 ;
 ; AVX2-LABEL: insert_dup_mem_v32i8_sext_i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    movsbl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %tmp = load i8, i8* %ptr, align 1
   %tmp1 = sext i8 %tmp to i32