ARM::D4, ARM::D5, ARM::D6, ARM::D7,
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
- // VFP3
+ // VFP3: D8-D15 are callee saved and should be allocated last.
+ // Save other low registers for use as DPR_VFP2 and DPR_8 classes.
static const unsigned ARM_DPR_VFP3[] = {
- ARM::D0, ARM::D1, ARM::D2, ARM::D3,
- ARM::D4, ARM::D5, ARM::D6, ARM::D7,
- ARM::D8, ARM::D9, ARM::D10, ARM::D11,
- ARM::D12, ARM::D13, ARM::D14, ARM::D15,
ARM::D16, ARM::D17, ARM::D18, ARM::D19,
ARM::D20, ARM::D21, ARM::D22, ARM::D23,
ARM::D24, ARM::D25, ARM::D26, ARM::D27,
- ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
+ ARM::D28, ARM::D29, ARM::D30, ARM::D31,
+ ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+ ARM::D8, ARM::D9, ARM::D10, ARM::D11,
+ ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
+
DPRClass::iterator
DPRClass::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
let SubRegClasses = [(DPR dsub_0, dsub_1)];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Q4-Q7 are callee saved and should be allocated last.
+ // Save other low registers for use as QPR_VFP2 and QPR_8 classes.
+ static const unsigned ARM_QPR[] = {
+ ARM::Q8, ARM::Q9, ARM::Q10, ARM::Q11,
+ ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15,
+ ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3,
+ ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7 };
+
+ QPRClass::iterator
+ QPRClass::allocation_order_begin(const MachineFunction &MF) const {
+ return ARM_QPR;
+ }
+
+ QPRClass::iterator
+ QPRClass::allocation_order_end(const MachineFunction &MF) const {
+ return ARM_QPR + (sizeof(ARM_QPR)/sizeof(unsigned));
+ }
+ }];
}
// Subset of QPR that have 32-bit SPR subregs.
[QQ0, QQ1, QQ2, QQ3, QQ4, QQ5, QQ6, QQ7]> {
let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3),
(QPR qsub_0, qsub_1)];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // QQ2-QQ3 are callee saved and should be allocated last.
+ // Save other low registers for use as QPR_VFP2 and QPR_8 classes.
+ static const unsigned ARM_QQPR[] = {
+ ARM::QQ4, ARM::QQ5, ARM::QQ6, ARM::QQ7,
+ ARM::QQ0, ARM::QQ1, ARM::QQ2, ARM::QQ3 };
+
+ QQPRClass::iterator
+ QQPRClass::allocation_order_begin(const MachineFunction &MF) const {
+ return ARM_QQPR;
+ }
+
+ QQPRClass::iterator
+ QQPRClass::allocation_order_end(const MachineFunction &MF) const {
+ return ARM_QQPR + (sizeof(ARM_QQPR)/sizeof(unsigned));
+ }
+ }];
}
// Subset of QQPR that have 32-bit SPR subregs.
let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3,
dsub_4, dsub_5, dsub_6, dsub_7),
(QPR qsub_0, qsub_1, qsub_2, qsub_3)];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // QQQQ1 is callee saved and should be allocated last.
+ // Save QQQQ0 for use as QPR_VFP2 and QPR_8 classes.
+ static const unsigned ARM_QQQQPR[] = {
+ ARM::QQQQ2, ARM::QQQQ3, ARM::QQQQ0, ARM::QQQQ1 };
+
+ QQQQPRClass::iterator
+ QQQQPRClass::allocation_order_begin(const MachineFunction &MF) const {
+ return ARM_QQQQPR;
+ }
+
+ QQQQPRClass::iterator
+ QQQQPRClass::allocation_order_end(const MachineFunction &MF) const {
+ return ARM_QQQQPR + (sizeof(ARM_QQQQPR)/sizeof(unsigned));
+ }
+ }];
}
// Condition code registers.
; %reg1028 gets allocated %Q0, and if %reg1030 is reloaded for the partial
; redef, it cannot also get %Q0.
-; CHECK: vld1.64 {d0, d1}, [r{{.}}]
-; CHECK-NOT: vld1.64 {d0, d1}
-; CHECK: vmov.f64 d3, d0
+; CHECK: vld1.64 {d16, d17}, [r{{.}}]
+; CHECK-NOT: vld1.64 {d16, d17}
+; CHECK: vmov.f64 d19, d16
define i32 @test(i8* %arg) nounwind {
entry:
define double @t2(double %x) nounwind readnone optsize {
entry:
; CHECK: t2:
-; CHECK: vmov.f64 d1, #3.000000e+00
+; CHECK: vmov.f64 d{{.*}}, #3.000000e+00
%0 = fadd double %x, 3.000000e+00
ret double %0
}
define double @t3(double %x) nounwind readnone optsize {
entry:
; CHECK: t3:
-; CHECK: vmov.f64 d1, #-1.300000e+01
+; CHECK: vmov.f64 d{{.*}}, #-1.300000e+01
%0 = fmul double %x, -1.300000e+01
ret double %0
}
entry:
; CHECK: vmov.I64 q15, #0
; CHECK: vmov.32 d30[0], r0
-; CHECK: vmov q0, q15
+; CHECK: vmov q8, q15
%tmp = alloca %struct.int32x4_t, align 16
call void asm sideeffect "vmov.I64 q15, #0\0Avmov.32 d30[0], $1\0Avmov ${0:q}, q15\0A", "=*w,r,~{d31},~{d30}"(%struct.int32x4_t* %tmp, i32 8192) nounwind
ret void
define void @t2() nounwind {
entry:
-; CHECK: vmov d30, d0
+; CHECK: vmov d30, d16
; CHECK: vmov.32 r0, d30[0]
%asmtmp2 = tail call i32 asm sideeffect "vmov d30, $1\0Avmov.32 $0, d30[0]\0A", "=r,w,~{d30}"(<2 x i32> undef) nounwind
ret void
return2:
; CHECK: %return2
; CHECK: vadd.i32
-; CHECK: vmov q1, q3
+; CHECK: vmov q9, q11
; CHECK-NOT: vmov
-; CHECK: vst2.32 {d0, d1, d2, d3}
+; CHECK: vst2.32 {d16, d17, d18, d19}
%tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
%tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
%tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1]
define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
; CHECK: t5:
; CHECK: vldmia
-; CHECK: vmov q1, q0
+; CHECK: vmov q9, q8
; CHECK-NOT: vmov
-; CHECK: vld2.16 {d0[1], d2[1]}, [r0]
+; CHECK: vld2.16 {d16[1], d18[1]}, [r0]
; CHECK-NOT: vmov
; CHECK: vadd.i16
%tmp0 = bitcast i16* %A to i8* ; <i8*> [#uses=1]
define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
; CHECK: t6:
; CHECK: vldr.64
-; CHECK: vmov d1, d0
-; CHECK-NEXT: vld2.8 {d0[1], d1[1]}
+; CHECK: vmov d17, d16
+; CHECK-NEXT: vld2.8 {d16[1], d17[1]}
%tmp1 = load <8 x i8>* %B ; <<8 x i8>> [#uses=2]
%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
; CHECK: t7:
; CHECK: vld2.32
; CHECK: vst2.32
-; CHECK: vld1.32 {d0, d1},
-; CHECK: vmov q1, q0
+; CHECK: vld1.32 {d16, d17},
+; CHECK: vmov q9, q8
; CHECK-NOT: vmov
-; CHECK: vuzp.32 q0, q1
+; CHECK: vuzp.32 q8, q9
; CHECK: vst1.32
%0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2]
%1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
; PR7156
define arm_aapcs_vfpcc i32 @t8() nounwind {
; CHECK: t8:
-; CHECK: vrsqrte.f32 q0, q0
+; CHECK: vrsqrte.f32 q8, q8
bb.nph55.bb.nph55.split_crit_edge:
br label %bb3
define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
; CHECK: t9:
; CHECK: vldr.64
-; CHECK-NOT: vmov d{{.*}}, d0
-; CHECK: vmov.i32 d1
-; CHECK-NEXT: vstmia r0, {d0, d1}
-; CHECK-NEXT: vstmia r0, {d0, d1}
+; CHECK-NOT: vmov d{{.*}}, d16
+; CHECK: vmov.i32 d17
+; CHECK-NEXT: vstmia r0, {d16, d17}
+; CHECK-NEXT: vstmia r0, {d16, d17}
%3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
%4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
store <4 x float> %4, <4 x float>* undef, align 16
define arm_aapcs_vfpcc i32 @t10() nounwind {
entry:
; CHECK: t10:
-; CHECK: vmov.i32 q1, #0x3F000000
-; CHECK: vmov d0, d1
-; CHECK: vmla.f32 q0, q0, d0[0]
+; CHECK: vmov.i32 q9, #0x3F000000
+; CHECK: vmov d0, d17
+; CHECK: vmla.f32 q8, q8, d0[0]
%0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
%1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
%2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]
%1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
store float 0.000000e+00, float* undef, align 4
%2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
+ %ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
%val173 = load <4 x float>* undef ; <<4 x float>> [#uses=1]
br label %bb4
%18 = fmul <4 x float> %17, %val173 ; <<4 x float>> [#uses=1]
%19 = shufflevector <4 x float> %18, <4 x float> undef, <2 x i32> <i32 2, i32 3> ; <<2 x float>> [#uses=1]
%20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
- %21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
+ %tmp1 = fadd <4 x float> %20, %ld3
+ %tmp2 = fadd <4 x float> %tmp1, %ld4
+ %tmp3 = fadd <4 x float> %tmp2, %ld5
+ %tmp4 = fadd <4 x float> %tmp3, %ld6
+ %tmp5 = fadd <4 x float> %tmp4, %ld7
+ %tmp6 = fadd <4 x float> %tmp5, %ld8
+ %tmp7 = fadd <4 x float> %tmp6, %ld9
+ %tmp8 = fadd <4 x float> %tmp7, %ld10
+ %tmp9 = fadd <4 x float> %tmp8, %ld11
+ %21 = fadd <4 x float> %tmp9, %ld12
%22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
%tmp = extractelement <4 x i1> %22, i32 0
br i1 %tmp, label %bb193, label %bb186
; rdar://7923010
define <4 x i32> @vcgt_zext(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vcgt_zext:
-;CHECK: vcgt.f32 q0
-;CHECK: vmov.i32 q1, #0x1
-;CHECK: vand q0, q0, q1
+;CHECK: vcgt.f32 q8
+;CHECK: vmov.i32 q9, #0x1
+;CHECK: vand q8, q8, q9
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = fcmp ogt <4 x float> %tmp1, %tmp2
define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind {
entry:
-; CHECK: vmov.u16 r0, d0[1]
+; CHECK: vmov.u16 r0, d{{.*}}[1]
%arg0_uint16x4_t = alloca <4 x i16> ; <<4 x i16>*> [#uses=1]
%out_uint16_t = alloca i16 ; <i16*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind {
entry:
-; CHECK: vmov.u8 r0, d0[1]
+; CHECK: vmov.u8 r0, d{{.*}}[1]
%arg0_uint8x8_t = alloca <8 x i8> ; <<8 x i8>*> [#uses=1]
%out_uint8_t = alloca i8 ; <i8*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind {
entry:
-; CHECK: vmov.u16 r0, d0[1]
+; CHECK: vmov.u16 r0, d{{.*}}[1]
%arg0_uint16x8_t = alloca <8 x i16> ; <<8 x i16>*> [#uses=1]
%out_uint16_t = alloca i16 ; <i16*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind {
entry:
-; CHECK: vmov.u8 r0, d0[1]
+; CHECK: vmov.u8 r0, d{{.*}}[1]
%arg0_uint8x16_t = alloca <16 x i8> ; <<16 x i8>*> [#uses=1]
%out_uint8_t = alloca i8 ; <i8*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
define <8 x i8> @vld1i8(i8* %A) nounwind {
;CHECK: vld1i8:
;Check the alignment value. Max for this instruction is 64 bits:
-;CHECK: vld1.8 {d0}, [r0, :64]
+;CHECK: vld1.8 {d16}, [r0, :64]
%tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A, i32 16)
ret <8 x i8> %tmp1
}
define <16 x i8> @vld1Qi8(i8* %A) nounwind {
;CHECK: vld1Qi8:
;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vld1.8 {d0, d1}, [r0, :64]
+;CHECK: vld1.8 {d16, d17}, [r0, :64]
%tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8)
ret <16 x i8> %tmp1
}
define <8 x i16> @vld1Qi16(i16* %A) nounwind {
;CHECK: vld1Qi16:
;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vld1.16 {d0, d1}, [r0, :128]
+;CHECK: vld1.16 {d16, d17}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0, i32 32)
ret <8 x i16> %tmp1
define <8 x i8> @vld2i8(i8* %A) nounwind {
;CHECK: vld2i8:
;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vld2.8 {d0, d1}, [r0, :64]
+;CHECK: vld2.8 {d16, d17}, [r0, :64]
%tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8* %A, i32 8)
%tmp2 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 1
define <4 x i16> @vld2i16(i16* %A) nounwind {
;CHECK: vld2i16:
;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vld2.16 {d0, d1}, [r0, :128]
+;CHECK: vld2.16 {d16, d17}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8* %tmp0, i32 32)
%tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0
define <1 x i64> @vld2i64(i64* %A) nounwind {
;CHECK: vld2i64:
;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vld1.64 {d0, d1}, [r0, :128]
+;CHECK: vld1.64 {d16, d17}, [r0, :128]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8* %tmp0, i32 32)
%tmp2 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 0
define <16 x i8> @vld2Qi8(i8* %A) nounwind {
;CHECK: vld2Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld2.8 {d0, d1, d2, d3}, [r0, :64]
+;CHECK: vld2.8 {d16, d17, d18, d19}, [r0, :64]
%tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 8)
%tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
define <8 x i16> @vld2Qi16(i16* %A) nounwind {
;CHECK: vld2Qi16:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld2.16 {d0, d1, d2, d3}, [r0, :128]
+;CHECK: vld2.16 {d16, d17, d18, d19}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8* %tmp0, i32 16)
%tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0
define <4 x i32> @vld2Qi32(i32* %A) nounwind {
;CHECK: vld2Qi32:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld2.32 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vld2.32 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp0, i32 64)
%tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0
define <8 x i8> @vld3i8(i8* %A) nounwind {
;CHECK: vld3i8:
;Check the alignment value. Max for this instruction is 64 bits:
-;CHECK: vld3.8 {d0, d1, d2}, [r0, :64]
+;CHECK: vld3.8 {d16, d17, d18}, [r0, :64]
%tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 32)
%tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2
define <1 x i64> @vld3i64(i64* %A) nounwind {
;CHECK: vld3i64:
;Check the alignment value. Max for this instruction is 64 bits:
-;CHECK: vld1.64 {d0, d1, d2}, [r0, :64]
+;CHECK: vld1.64 {d16, d17, d18}, [r0, :64]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16)
%tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
define <16 x i8> @vld3Qi8(i8* %A) nounwind {
;CHECK: vld3Qi8:
;Check the alignment value. Max for this instruction is 64 bits:
-;CHECK: vld3.8 {d0, d2, d4}, [r0, :64]!
-;CHECK: vld3.8 {d1, d3, d5}, [r0, :64]
+;CHECK: vld3.8 {d16, d18, d20}, [r0, :64]!
+;CHECK: vld3.8 {d17, d19, d21}, [r0, :64]
%tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A, i32 32)
%tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2
define <8 x i8> @vld4i8(i8* %A) nounwind {
;CHECK: vld4i8:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld4.8 {d0, d1, d2, d3}, [r0, :64]
+;CHECK: vld4.8 {d16, d17, d18, d19}, [r0, :64]
%tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 8)
%tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
define <4 x i16> @vld4i16(i16* %A) nounwind {
;CHECK: vld4i16:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld4.16 {d0, d1, d2, d3}, [r0, :128]
+;CHECK: vld4.16 {d16, d17, d18, d19}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8* %tmp0, i32 16)
%tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0
define <2 x i32> @vld4i32(i32* %A) nounwind {
;CHECK: vld4i32:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld4.32 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vld4.32 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %tmp0, i32 32)
%tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
define <1 x i64> @vld4i64(i64* %A) nounwind {
;CHECK: vld4i64:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld1.64 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vld1.64 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64)
%tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
define <16 x i8> @vld4Qi8(i8* %A) nounwind {
;CHECK: vld4Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld4.8 {d0, d2, d4, d6}, [r0, :256]!
-;CHECK: vld4.8 {d1, d3, d5, d7}, [r0, :256]
+;CHECK: vld4.8 {d16, d18, d20, d22}, [r0, :256]!
+;CHECK: vld4.8 {d17, d19, d21, d23}, [r0, :256]
%tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8* %A, i32 64)
%tmp2 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 2
define <8 x i16> @vld4Qi16(i16* %A) nounwind {
;CHECK: vld4Qi16:
;Check for no alignment specifier.
-;CHECK: vld4.16 {d0, d2, d4, d6}, [r0]!
-;CHECK: vld4.16 {d1, d3, d5, d7}, [r0]
+;CHECK: vld4.16 {d16, d18, d20, d22}, [r0]!
+;CHECK: vld4.16 {d17, d19, d21, d23}, [r0]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0
define <8 x i8> @v_movi8() nounwind {
;CHECK: v_movi8:
-;CHECK: vmov.i8 d0, #0x8
+;CHECK: vmov.i8 d{{.*}}, #0x8
ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <4 x i16> @v_movi16a() nounwind {
;CHECK: v_movi16a:
-;CHECK: vmov.i16 d0, #0x10
+;CHECK: vmov.i16 d{{.*}}, #0x10
ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 >
}
define <4 x i16> @v_movi16b() nounwind {
;CHECK: v_movi16b:
-;CHECK: vmov.i16 d0, #0x1000
+;CHECK: vmov.i16 d{{.*}}, #0x1000
ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <4 x i16> @v_mvni16a() nounwind {
;CHECK: v_mvni16a:
-;CHECK: vmvn.i16 d0, #0x10
+;CHECK: vmvn.i16 d{{.*}}, #0x10
ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 >
}
define <4 x i16> @v_mvni16b() nounwind {
;CHECK: v_mvni16b:
-;CHECK: vmvn.i16 d0, #0x1000
+;CHECK: vmvn.i16 d{{.*}}, #0x1000
ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 >
}
define <2 x i32> @v_movi32a() nounwind {
;CHECK: v_movi32a:
-;CHECK: vmov.i32 d0, #0x20
+;CHECK: vmov.i32 d{{.*}}, #0x20
ret <2 x i32> < i32 32, i32 32 >
}
define <2 x i32> @v_movi32b() nounwind {
;CHECK: v_movi32b:
-;CHECK: vmov.i32 d0, #0x2000
+;CHECK: vmov.i32 d{{.*}}, #0x2000
ret <2 x i32> < i32 8192, i32 8192 >
}
define <2 x i32> @v_movi32c() nounwind {
;CHECK: v_movi32c:
-;CHECK: vmov.i32 d0, #0x200000
+;CHECK: vmov.i32 d{{.*}}, #0x200000
ret <2 x i32> < i32 2097152, i32 2097152 >
}
define <2 x i32> @v_movi32d() nounwind {
;CHECK: v_movi32d:
-;CHECK: vmov.i32 d0, #0x20000000
+;CHECK: vmov.i32 d{{.*}}, #0x20000000
ret <2 x i32> < i32 536870912, i32 536870912 >
}
define <2 x i32> @v_movi32e() nounwind {
;CHECK: v_movi32e:
-;CHECK: vmov.i32 d0, #0x20FF
+;CHECK: vmov.i32 d{{.*}}, #0x20FF
ret <2 x i32> < i32 8447, i32 8447 >
}
define <2 x i32> @v_movi32f() nounwind {
;CHECK: v_movi32f:
-;CHECK: vmov.i32 d0, #0x20FFFF
+;CHECK: vmov.i32 d{{.*}}, #0x20FFFF
ret <2 x i32> < i32 2162687, i32 2162687 >
}
define <2 x i32> @v_mvni32a() nounwind {
;CHECK: v_mvni32a:
-;CHECK: vmvn.i32 d0, #0x20
+;CHECK: vmvn.i32 d{{.*}}, #0x20
ret <2 x i32> < i32 4294967263, i32 4294967263 >
}
define <2 x i32> @v_mvni32b() nounwind {
;CHECK: v_mvni32b:
-;CHECK: vmvn.i32 d0, #0x2000
+;CHECK: vmvn.i32 d{{.*}}, #0x2000
ret <2 x i32> < i32 4294959103, i32 4294959103 >
}
define <2 x i32> @v_mvni32c() nounwind {
;CHECK: v_mvni32c:
-;CHECK: vmvn.i32 d0, #0x200000
+;CHECK: vmvn.i32 d{{.*}}, #0x200000
ret <2 x i32> < i32 4292870143, i32 4292870143 >
}
define <2 x i32> @v_mvni32d() nounwind {
;CHECK: v_mvni32d:
-;CHECK: vmvn.i32 d0, #0x20000000
+;CHECK: vmvn.i32 d{{.*}}, #0x20000000
ret <2 x i32> < i32 3758096383, i32 3758096383 >
}
define <2 x i32> @v_mvni32e() nounwind {
;CHECK: v_mvni32e:
-;CHECK: vmvn.i32 d0, #0x20FF
+;CHECK: vmvn.i32 d{{.*}}, #0x20FF
ret <2 x i32> < i32 4294958848, i32 4294958848 >
}
define <2 x i32> @v_mvni32f() nounwind {
;CHECK: v_mvni32f:
-;CHECK: vmvn.i32 d0, #0x20FFFF
+;CHECK: vmvn.i32 d{{.*}}, #0x20FFFF
ret <2 x i32> < i32 4292804608, i32 4292804608 >
}
define <1 x i64> @v_movi64() nounwind {
;CHECK: v_movi64:
-;CHECK: vmov.i64 d0, #0xFF0000FF0000FFFF
+;CHECK: vmov.i64 d{{.*}}, #0xFF0000FF0000FFFF
ret <1 x i64> < i64 18374687574888349695 >
}
define <16 x i8> @v_movQi8() nounwind {
;CHECK: v_movQi8:
-;CHECK: vmov.i8 q0, #0x8
+;CHECK: vmov.i8 q{{.*}}, #0x8
ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <8 x i16> @v_movQi16a() nounwind {
;CHECK: v_movQi16a:
-;CHECK: vmov.i16 q0, #0x10
+;CHECK: vmov.i16 q{{.*}}, #0x10
ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
}
define <8 x i16> @v_movQi16b() nounwind {
;CHECK: v_movQi16b:
-;CHECK: vmov.i16 q0, #0x1000
+;CHECK: vmov.i16 q{{.*}}, #0x1000
ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <4 x i32> @v_movQi32a() nounwind {
;CHECK: v_movQi32a:
-;CHECK: vmov.i32 q0, #0x20
+;CHECK: vmov.i32 q{{.*}}, #0x20
ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 >
}
define <4 x i32> @v_movQi32b() nounwind {
;CHECK: v_movQi32b:
-;CHECK: vmov.i32 q0, #0x2000
+;CHECK: vmov.i32 q{{.*}}, #0x2000
ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 >
}
define <4 x i32> @v_movQi32c() nounwind {
;CHECK: v_movQi32c:
-;CHECK: vmov.i32 q0, #0x200000
+;CHECK: vmov.i32 q{{.*}}, #0x200000
ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 >
}
define <4 x i32> @v_movQi32d() nounwind {
;CHECK: v_movQi32d:
-;CHECK: vmov.i32 q0, #0x20000000
+;CHECK: vmov.i32 q{{.*}}, #0x20000000
ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 >
}
define <4 x i32> @v_movQi32e() nounwind {
;CHECK: v_movQi32e:
-;CHECK: vmov.i32 q0, #0x20FF
+;CHECK: vmov.i32 q{{.*}}, #0x20FF
ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 >
}
define <4 x i32> @v_movQi32f() nounwind {
;CHECK: v_movQi32f:
-;CHECK: vmov.i32 q0, #0x20FFFF
+;CHECK: vmov.i32 q{{.*}}, #0x20FFFF
ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 >
}
define <2 x i64> @v_movQi64() nounwind {
;CHECK: v_movQi64:
-;CHECK: vmov.i64 q0, #0xFF0000FF0000FFFF
+;CHECK: vmov.i64 q{{.*}}, #0xFF0000FF0000FFFF
ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
}
define void @vdupn128(%struct.int8x8_t* noalias nocapture sret %agg.result) nounwind {
entry:
;CHECK: vdupn128:
-;CHECK: vmov.i8 d0, #0x80
+;CHECK: vmov.i8 d{{.*}}, #0x80
%0 = getelementptr inbounds %struct.int8x8_t* %agg.result, i32 0, i32 0 ; <<8 x i8>*> [#uses=1]
store <8 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>, <8 x i8>* %0, align 8
ret void
define void @vdupnneg75(%struct.int8x8_t* noalias nocapture sret %agg.result) nounwind {
entry:
;CHECK: vdupnneg75:
-;CHECK: vmov.i8 d0, #0xB5
+;CHECK: vmov.i8 d{{.*}}, #0xB5
%0 = getelementptr inbounds %struct.int8x8_t* %agg.result, i32 0, i32 0 ; <<8 x i8>*> [#uses=1]
store <8 x i8> <i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75>, <8 x i8>* %0, align 8
ret void
define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst1i8:
;Check the alignment value. Max for this instruction is 64 bits:
-;CHECK: vst1.8 {d0}, [r0, :64]
+;CHECK: vst1.8 {d16}, [r0, :64]
%tmp1 = load <8 x i8>* %B
call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1, i32 16)
ret void
define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst1Qi8:
;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vst1.8 {d0, d1}, [r0, :64]
+;CHECK: vst1.8 {d16, d17}, [r0, :64]
%tmp1 = load <16 x i8>* %B
call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1, i32 8)
ret void
define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst1Qi16:
;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vst1.16 {d0, d1}, [r0, :128]
+;CHECK: vst1.16 {d16, d17}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 32)
define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst2i8:
;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vst2.8 {d0, d1}, [r0, :64]
+;CHECK: vst2.8 {d16, d17}, [r0, :64]
%tmp1 = load <8 x i8>* %B
call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
ret void
define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst2i16:
;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vst2.16 {d0, d1}, [r0, :128]
+;CHECK: vst2.16 {d16, d17}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>* %B
call void @llvm.arm.neon.vst2.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 32)
define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind {
;CHECK: vst2i64:
;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vst1.64 {d0, d1}, [r0, :128]
+;CHECK: vst1.64 {d16, d17}, [r0, :128]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>* %B
call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 32)
define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst2Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst2.8 {d0, d1, d2, d3}, [r0, :64]
+;CHECK: vst2.8 {d16, d17, d18, d19}, [r0, :64]
%tmp1 = load <16 x i8>* %B
call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 8)
ret void
define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst2Qi16:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst2.16 {d0, d1, d2, d3}, [r0, :128]
+;CHECK: vst2.16 {d16, d17, d18, d19}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 16)
define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst2Qi32:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst2.32 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vst2.32 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>* %B
call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 64)
define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst4i8:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst4.8 {d0, d1, d2, d3}, [r0, :64]
+;CHECK: vst4.8 {d16, d17, d18, d19}, [r0, :64]
%tmp1 = load <8 x i8>* %B
call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
ret void
define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst4i16:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst4.16 {d0, d1, d2, d3}, [r0, :128]
+;CHECK: vst4.16 {d16, d17, d18, d19}, [r0, :128]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>* %B
call void @llvm.arm.neon.vst4.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 16)
define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst4i32:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst4.32 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vst4.32 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>* %B
call void @llvm.arm.neon.vst4.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 32)
define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind {
;CHECK: vst4i64:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst1.64 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vst1.64 {d16, d17, d18, d19}, [r0, :256]
%tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>* %B
call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 64)
define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst4Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst4.8 {d0, d2, d4, d6}, [r0, :256]!
-;CHECK: vst4.8 {d1, d3, d5, d7}, [r0, :256]
+;CHECK: vst4.8 {d16, d18, d20, d22}, [r0, :256]!
+;CHECK: vst4.8 {d17, d19, d21, d23}, [r0, :256]
%tmp1 = load <16 x i8>* %B
call void @llvm.arm.neon.vst4.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 64)
ret void
define void @vst4Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst4Qi16:
;Check for no alignment specifier.
-;CHECK: vst4.16 {d0, d2, d4, d6}, [r0]!
-;CHECK: vst4.16 {d1, d3, d5, d7}, [r0]
+;CHECK: vst4.16 {d16, d18, d20, d22}, [r0]!
+;CHECK: vst4.16 {d17, d19, d21, d23}, [r0]
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
call void @llvm.arm.neon.vst4.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
%4 = insertelement <2 x double> %2, double %V.0.ph, i32 1 ; <<2 x double>> [#uses=2]
; Constant pool load followed by add.
; Then clobber the loaded register, not the sum.
-; CHECK: vldr.64 [[LDR:d.]]
-; CHECK: vadd.f64 [[ADD:d.]], [[LDR]], [[LDR]]
+; CHECK: vldr.64 [[LDR:d.*]],
+; CHECK: vadd.f64 [[ADD:d.*]], [[LDR]], [[LDR]]
; CHECK: vmov.f64 [[LDR]]
%5 = fadd <2 x double> %3, %3 ; <<2 x double>> [#uses=2]
%6 = fadd <2 x double> %4, %4 ; <<2 x double>> [#uses=2]
%0 = fmul double %a, %b
; CORTEXM3: blx ___muldf3
; CORTEXM4: blx ___muldf3
-; CORTEXA8: vmul.f64 d0, d1, d0
+; CORTEXA8: vmul.f64 d16, d17, d16
ret double %0
}
entry:
; CHECK: t2:
; CHECK: adr r{{.}}, #LCPI1_0
-; CHECK: vldmia r3, {d0, d1}
+; CHECK: vldmia r3, {d16, d17}
br i1 undef, label %bb1, label %bb2
bb1:
%1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
store float 0.000000e+00, float* undef, align 4
%2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
+ %ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
+ %ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+ store float 0.000000e+00, float* undef, align 4
%val173 = load <4 x float>* undef ; <<4 x float>> [#uses=1]
br label %bb4
%18 = fmul <4 x float> %17, %val173 ; <<4 x float>> [#uses=1]
%19 = shufflevector <4 x float> %18, <4 x float> undef, <2 x i32> <i32 2, i32 3> ; <<2 x float>> [#uses=1]
%20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
- %21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
+ %tmp1 = fadd <4 x float> %20, %ld3
+ %tmp2 = fadd <4 x float> %tmp1, %ld4
+ %tmp3 = fadd <4 x float> %tmp2, %ld5
+ %tmp4 = fadd <4 x float> %tmp3, %ld6
+ %tmp5 = fadd <4 x float> %tmp4, %ld7
+ %tmp6 = fadd <4 x float> %tmp5, %ld8
+ %tmp7 = fadd <4 x float> %tmp6, %ld9
+ %tmp8 = fadd <4 x float> %tmp7, %ld10
+ %tmp9 = fadd <4 x float> %tmp8, %ld11
+ %21 = fadd <4 x float> %tmp9, %ld12
%22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
%tmp = extractelement <4 x i1> %22, i32 0
br i1 %tmp, label %bb193, label %bb186