return 0;
}
+/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
+/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
+/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded.
+static bool isVectorBroadcast(SDValue &Op) {
+ EVT VT = Op.getValueType();
+ bool Is256 = VT.getSizeInBits() == 256;
+
+ assert((VT.getSizeInBits() == 128 || Is256) &&
+ "Unsupported type for vbroadcast node");
+
+ SDValue V = Op;
+ if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ if (Is256 && !(V.hasOneUse() &&
+ V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V.getOperand(0).getOpcode() == ISD::UNDEF))
+ return false;
+
+ if (Is256)
+ V = V.getOperand(1);
+ if (V.hasOneUse() && V.getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return false;
+
+ // Check the source scalar_to_vector type. 256-bit broadcasts are
+ // supported for 32/64-bit sizes, while 128-bit ones are only supported
+ // for 32-bit scalars.
+ unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
+ if (ScalarSize != 32 && ScalarSize != 64)
+ return false;
+ if (!Is256 && ScalarSize == 64)
+ return false;
+
+ V = V.getOperand(0);
+ if (!MayFoldLoad(V))
+ return false;
+
+ // Return the load node
+ Op = V;
+ return true;
+}
+
static
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI,
if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
return Op;
+ // Use vbroadcast whenever the splat comes from a foldable load
+ if (Subtarget->hasAVX() && isVectorBroadcast(V1))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
+
// Handle splats by matching through known shuffle masks
if (VT.is128BitVector() && NumElem <= 4)
return SDValue();
case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
+ case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS";
case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY";
case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD";
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
(VBROADCASTF128 addr:$src)>;
+def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSSY addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VBROADCASTSD addr:$src)>;
+def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
+ (VBROADCASTSSY addr:$src)>;
+def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VBROADCASTSD addr:$src)>;
+
+def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ (VBROADCASTSS addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSS addr:$src)>;
+
//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; XFAIL: *
+
+; xfail this file for now because of PR8156, when it gets solved merge this with avx-splat.ll
+
+; CHECK: vbroadcastsd (%
+define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
+entry:
+ %q = load i64* %ptr, align 8
+ %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+ %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
+ %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
+ %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
+ ret <4 x i64> %vecinit6.i
+}
+
+; CHECK: vbroadcastss (%
+define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
+entry:
+ %q = load i32* %ptr, align 4
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
+ %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
+ %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
+ ret <8 x i32> %vecinit6.i
+}
+
+; CHECK: vbroadcastsd (%
+define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
+entry:
+ %q = load double* %ptr, align 8
+ %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
+ %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
+ %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
+ %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
+ ret <4 x double> %vecinit6.i
+}
+
+; CHECK: vbroadcastss (%
+define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
+entry:
+ %q = load float* %ptr, align 4
+ %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
+ ret <8 x float> %vecinit6.i
+}
+
+;;;; 128-bit versions
+
+; CHECK: vbroadcastss (%
+define <4 x float> @E(float* %ptr) nounwind uwtable readnone ssp {
+entry:
+ %q = load float* %ptr, align 4
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ ret <4 x float> %vecinit6.i
+}
+
+; CHECK: vbroadcastss (%
+define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp {
+entry:
+ %q = load i32* %ptr, align 4
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
+ %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
+ %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
+ ret <4 x i32> %vecinit6.i
+}
+
+; Unsupported vbroadcasts
+
+; CHECK: _G
+; CHECK-NOT: vbroadcastsd (%
+define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
+entry:
+ %q = load i64* %ptr, align 8
+ %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
+ %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
+ ret <2 x i64> %vecinit2.i
+}