(64 to 128-bit) matches against the pattern fragment 'vzmovl_v2i64'
(a zero-extended 64-bit load).
However, a change in r248784 teaches the instruction combiner that only
the lower 64 bits of the input to a 128-bit vcvtph2ps are used. This means
the instruction combiner will ordinarily optimize away the upper 64-bit
insertelement instruction in the zero-extension and so we no longer select
the memory-register form. To fix this a new pattern has been added.
Differential Revision: http://reviews.llvm.org/D16067
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@257470
91177308-0d34-0410-b5e6-
96231b3b80d8
(VCVTPH2PSrm addr:$src)>;
def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
(VCVTPH2PSrm addr:$src)>;
+ def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VCVTPH2PSrm addr:$src)>;
def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
(int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
ret <4 x float> %res
}
+define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
+; CHECK-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; CHECK-NOT: vmov
+; CHECK: vcvtph2ps (%
+
+ %load = load i64, i64* %ptr
+ %ins = insertelement <2 x i64> undef, i64 %load, i32 0
+ %bc = bitcast <2 x i64> %ins to <8 x i16>
+ %res = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %bc)
+ ret <4 x float> %res
+}
+
define void @test_x86_vcvtps2ph_256_m(<8 x i16>* nocapture %d, <8 x float> %a) nounwind {
entry:
; CHECK-LABEL: test_x86_vcvtps2ph_256_m: