[x86] Teach the v4f32 path of the new shuffle lowering to handle the ...

author Chandler Carruth <chandlerc@gmail.com>

Sat, 20 Sep 2014 04:15:22 +0000 (04:15 +0000)

committer Chandler Carruth <chandlerc@gmail.com>

Sat, 20 Sep 2014 04:15:22 +0000 (04:15 +0000)
author Chandler Carruth <chandlerc@gmail.com>
Sat, 20 Sep 2014 04:15:22 +0000 (04:15 +0000)
committer Chandler Carruth <chandlerc@gmail.com>
Sat, 20 Sep 2014 04:15:22 +0000 (04:15 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 7ca44b4615a6a38b384171170cc1ca0411e0db96..6ffda166ed209c11d4debe5af9ef0af19c4ff930 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7784,6 +7784,16 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
  
+  // There are special ways we can lower some single-element blends. However, we
+  // have custom ways we can lower more complex single-element blends below that
+  // we defer to if both this and BLENDPS fail to match, so restrict this to
+  // when the V2 input is targeting element 0 of the mask -- that is the fast
+  // case here.
+  if (NumV2Elements == 1 && Mask[0] >= 4)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
+      return V;
+
    if (Subtarget->hasSSE41())
      if (SDValue Blend =
              lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll

index 1dbc7f5e1dac4587e3abab2a456dbba085d55872..3645c9475a2edb70f3750ee4f1e55233842ba7b4 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -806,3 +806,74 @@ define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
    %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
    ret <4 x i32> %shuffle
  }
+
+define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
+; ALL-LABEL: @insert_reg_and_zero_v4i32
+; ALL:       # BB#0:
+; ALL-NEXT:    movd %edi, %xmm0
+; ALL-NEXT:    retq
+  %v = insertelement <4 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
+; ALL-LABEL: @insert_mem_and_zero_v4i32
+; ALL:       # BB#0:
+; ALL-NEXT:    movd (%rdi), %xmm0
+; ALL-NEXT:    retq
+  %a = load i32* %ptr
+  %v = insertelement <4 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
+; SSE2-LABEL: @insert_reg_and_zero_v4f32
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    movss %xmm0, %[[X]]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: @insert_reg_and_zero_v4f32
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE3-NEXT:    movss %xmm0, %[[X]]
+; SSE3-NEXT:    movaps %[[X]], %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: @insert_reg_and_zero_v4f32
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSSE3-NEXT:    movss %xmm0, %[[X]]
+; SSSE3-NEXT:    movaps %[[X]], %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: @insert_reg_and_zero_v4f32
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE41-NEXT:    movss %xmm0, %[[X]]
+; SSE41-NEXT:    movaps %[[X]], %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @insert_reg_and_zero_v4f32
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %[[X:xmm[0-9]+]], %[[X]], %[[X]]
+; AVX1-NEXT:    vmovss %xmm0, %[[X]], %xmm0
+; AVX1-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
+; ALL-LABEL: @insert_mem_and_zero_v4f32
+; ALL:       # BB#0:
+; ALL-NEXT:    movss (%rdi), %xmm0
+; ALL-NEXT:    retq
+  %a = load float* %ptr
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
author	Chandler Carruth <chandlerc@gmail.com>
	Sat, 20 Sep 2014 04:15:22 +0000 (04:15 +0000)
committer	Chandler Carruth <chandlerc@gmail.com>
	Sat, 20 Sep 2014 04:15:22 +0000 (04:15 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
test/CodeGen/X86/vector-shuffle-128-v4.ll		patch | blob | history