-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov instructions.
%struct.int16x8_t = type { <8 x i16> }
entry:
; CHECK: t2:
; CHECK: vld1.16
-; CHECK-NOT: vmov
-; CHECK: vmul.i16
; CHECK: vld1.16
-; CHECK: vst1.16
; CHECK-NOT: vmov
; CHECK: vmul.i16
+; CHECK: vmul.i16
+; CHECK-NOT: vmov
+; CHECK: vst1.16
; CHECK: vst1.16
%0 = getelementptr inbounds %struct.int16x8_t* %vT0ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1]
%1 = load <8 x i16>* %0, align 16 ; <<8 x i16>> [#uses=1]
define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
; CHECK: t9:
; CHECK: vldr.64
-; CHECK-NEXT: vstmia r0, {d0,d1}
-; CHECK-NEXT: vmov.i8 d1
+; CHECK: vmov.i8 d1
+; CHECK-NEXT: vstmia r0, {d2,d3}
; CHECK-NEXT: vstmia r0, {d0,d1}
%3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
%4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]