From 3d476a80e9da0136681f69247abfbd6de986c137 Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer
Date: Wed, 4 Sep 2013 17:41:16 +0000
Subject: [PATCH] Swift: Only build vldm/vstm with q register aligned register lists

Unaligned vldm/vstm need more uops and therefore are slower in general on
swift.

radar://14522102

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189961 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMLoadStoreOptimizer.cpp |  5 ++++-
 test/CodeGen/ARM/swift-vldm.ll           | 28 ++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/ARM/swift-vldm.ll

diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 1803a8a5887..237adb9b22e 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -489,7 +489,10 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
     if (Reg != ARM::SP &&
         NewOffset == Offset + (int)Size &&
         ((isNotVFP && RegNum > PRegNum) ||
-         ((Count < Limit) && RegNum == PRegNum+1))) {
+         ((Count < Limit) && RegNum == PRegNum+1)) &&
+        // On Swift we don't want vldm/vstm to start with an odd register num
+        // because Q register unaligned vldm/vstm need more uops.
+        (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
       Offset += Size;
       PRegNum = RegNum;
       ++Count;
diff --git a/test/CodeGen/ARM/swift-vldm.ll b/test/CodeGen/ARM/swift-vldm.ll
new file mode 100644
index 00000000000..6d76ee4e0b7
--- /dev/null
+++ b/test/CodeGen/ARM/swift-vldm.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
+
+; vldm instructions with register lists not aligned to q registers need more
+; micro-ops, so using them is not beneficial on swift.
+
+; CHECK-LABEL: test_vldm
+; CHECK: vldmia r1, {d18, d19, d20}
+; CHECK-NOT: vldmia r1, {d17, d18, d19, d20}
+
+define double @test_vldm(double %a, double %b, double* nocapture %x) {
+entry:
+  %mul73 = fmul double %a, %b
+  %addr1 = getelementptr double * %x, i32 1
+  %addr2 = getelementptr double * %x, i32 2
+  %addr3 = getelementptr double * %x, i32 3
+  %load0 = load double * %x
+  %load1 = load double * %addr1
+  %load2 = load double * %addr2
+  %load3 = load double * %addr3
+  %sub = fsub double %mul73, %load1
+  %mul = fmul double %mul73, %load0
+  %add = fadd double %mul73, %load2
+  %div = fdiv double %mul73, %load3
+  %red = fadd double %sub, %mul
+  %red2 = fadd double %div, %add
+  %red3 = fsub double %red, %red2
+  ret double %red3
+}
-- 
2.34.1
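
Illustration (not part of the commit): a minimal standalone C++ sketch of the
guard this patch adds, to make it easy to see which register lists the merger
is now allowed to grow. The helper name allowMerge and the sample register
numbers are hypothetical; only the boolean expression mirrors the condition
added to ARMLoadStoreOpt::MergeLDR_STR above. On ARM, Q<n> aliases the
D-register pair {D<2n>, D<2n+1>}, so a vldm/vstm list that starts at an odd
D-register number can never be Q-register aligned.

// swift_vldm_guard.cpp -- illustrative sketch only, not LLVM code.
#include <cstdio>

// Mirrors the clause added in this patch: when targeting Swift, a VFP
// (vldm/vstm) register list that currently holds a single register may only
// be extended if that first register has an even encoding, i.e. the list is
// Q-register aligned. Core-register (GPR) lists are unaffected.
static bool allowMerge(bool isSwift, bool isNotVFP, unsigned Count,
                       unsigned PRegNum) {
  return !isSwift || isNotVFP || Count != 1 || !(PRegNum & 0x1);
}

int main() {
  // GPR list on Swift: the rule does not apply.
  std::printf("gpr list, any start reg:  %d\n", allowMerge(true, true, 1, 17));
  // VFP list starting at d16 (even, low half of q8): may grow into a vldm.
  std::printf("vfp list starting at d16: %d\n", allowMerge(true, false, 1, 16));
  // VFP list starting at d17 (odd, high half of q8): a merged list may not
  // begin there.
  std::printf("vfp list starting at d17: %d\n", allowMerge(true, false, 1, 17));
  // Once the list already holds more than one register, the start register
  // has been vetted, so the check no longer fires.
  std::printf("vfp list of length two:   %d\n", allowMerge(true, false, 2, 18));
  return 0;
}

This is consistent with the CHECK lines in swift-vldm.ll: the merged load is
expected to be vldmia r1, {d18, d19, d20} rather than a list beginning at d17.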