[AArch64] Enable the narrow ld promotion only on profitable microarchitectures

author Jun Bum Lim <junbuml@codeaurora.org>

Fri, 6 Nov 2015 16:27:47 +0000 (16:27 +0000)

committer Jun Bum Lim <junbuml@codeaurora.org>

Fri, 6 Nov 2015 16:27:47 +0000 (16:27 +0000)
author Jun Bum Lim <junbuml@codeaurora.org>
Fri, 6 Nov 2015 16:27:47 +0000 (16:27 +0000)
committer Jun Bum Lim <junbuml@codeaurora.org>
Fri, 6 Nov 2015 16:27:47 +0000 (16:27 +0000)
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

index 2ed3be7f8ab7bf91cc280bd11c3707e415acba47..6ef4c269d8fe9d4f18dad04b714da518d90160c3 100644 (file)
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -78,13 +78,12 @@ typedef struct LdStPairFlags {
  
  struct AArch64LoadStoreOpt : public MachineFunctionPass {
    static char ID;
-  AArch64LoadStoreOpt() : MachineFunctionPass(ID), IsStrictAlign(false) {
+  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
      initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
    }
  
    const AArch64InstrInfo *TII;
    const TargetRegisterInfo *TRI;
-  bool IsStrictAlign;
  
    // Scan the instructions looking for a load/store that can be combined
    // with the current instruction into a load/store pair.
@@ -127,7 +126,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
    // Find and merge foldable ldr/str instructions.
    bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
  
-  bool optimizeBlock(MachineBasicBlock &MBB);
+  // Check if converting two narrow loads into a single wider load with
+  // bitfield extracts could be enabled.
+  bool enableNarrowLdMerge(MachineFunction &Fn);
+
+  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
  
    bool runOnMachineFunction(MachineFunction &Fn) override;
  
@@ -1161,7 +1164,8 @@ bool AArch64LoadStoreOpt::tryToMergeLdStInst(
    return false;
  }
  
-bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
+                                        bool enableNarrowLdOpt) {
    bool Modified = false;
    // Three transformations to do here:
    // 1) Find halfword loads that can be merged into a single 32-bit word load
@@ -1189,7 +1193,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
    //        ldr x0, [x2], #4
  
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
-       !IsStrictAlign && MBBI != E;) {
+       enableNarrowLdOpt && MBBI != E;) {
      MachineInstr *MI = MBBI;
      switch (MI->getOpcode()) {
      default:
@@ -1372,15 +1376,25 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
    return Modified;
  }
  
+// Decide whether converting two narrow loads into a single wider load with
+// bitfield extracts should be enabled for the subtarget of \p Fn. Returns true
+// only when the subtarget is Cortex-A57 and does not require strict alignment.
+bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
+  const AArch64Subtarget *SubTarget =
+      &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+  // FIXME: The benefit from converting narrow loads into a wider load is
+  // microarchitecture-dependent, as it assumes that a single load with two
+  // bitfield extracts is cheaper than two narrow loads. Currently, this
+  // conversion is enabled only on cortex-a57, on which performance benefits
+  // were verified.
+  bool ProfitableArch = SubTarget->isCortexA57();
+  // Both operands are boolean conditions, so combine them with logical &&
+  // rather than bitwise &.
+  return ProfitableArch && !SubTarget->requiresStrictAlign();
+}
+
  bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
    TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
    TRI = Fn.getSubtarget().getRegisterInfo();
-  IsStrictAlign = (static_cast<const AArch64Subtarget &>(Fn.getSubtarget()))
-                      .requiresStrictAlign();
  
    bool Modified = false;
+  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
    for (auto &MBB : Fn)
-    Modified |= optimizeBlock(MBB);
+    Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
  
    return Modified;
  }
diff --git a/test/CodeGen/AArch64/arm64-ldp.ll b/test/CodeGen/AArch64/arm64-ldp.ll

index f667ca46afa0dd553c40c32169d4f56662dcfcbe..6071d092f8b3736aa021db2431592ba1d7d6ae72 100644 (file)
--- a/test/CodeGen/AArch64/arm64-ldp.ll
+++ b/test/CodeGen/AArch64/arm64-ldp.ll
@@ -356,51 +356,3 @@ define i64 @ldp_sext_int_post(i32* %p) nounwind {
    ret i64 %add
  }
  
-; CHECK-LABEL: Ldrh_merge
-; CHECK-NOT: ldrh
-; CHECK: ldr [[NEW_DEST:w[0-9]+]]
-; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
-; CHECK: lsr  w{{[0-9]+}}, [[NEW_DEST]]
-
-define i16 @Ldrh_merge(i16* nocapture readonly %p) {
-  %1 = load i16, i16* %p, align 2
-  ;%conv = zext i16 %0 to i32
-  %arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
-  %2 = load i16, i16* %arrayidx2, align 2
-  %add = add nuw nsw i16 %1, %2
-  ret i16 %add
-}
-
-; CHECK-LABEL: Ldurh_merge
-; CHECK-NOT: ldurh
-; CHECK: ldur [[NEW_DEST:w[0-9]+]]
-; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
-; CHECK: lsr  w{{[0-9]+}}, [[NEW_DEST]]
-define i16 @Ldurh_merge(i16* nocapture readonly %p)  {
-entry:
-  %arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
-  %0 = load i16, i16* %arrayidx
-  %arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
-  %1 = load i16, i16* %arrayidx3
-  %add = add nuw nsw i16 %0, %1
-  ret i16 %add
-}
-
-; CHECK-LABEL: Ldrh_4_merge
-; CHECK-NOT: ldrh
-; CHECK: ldp [[NEW_DEST:w[0-9]+]]
-define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
-  %arrayidx = getelementptr inbounds i16, i16* %P, i64 0
-  %l0 = load i16, i16* %arrayidx
-  %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
-  %l1 = load i16, i16* %arrayidx2
-  %arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
-  %l2 = load i16, i16* %arrayidx7
-  %arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
-  %l3 = load i16, i16* %arrayidx12
-  %add4 = add nuw nsw i16 %l1, %l0
-  %add9 = add nuw nsw i16 %add4, %l2
-  %add14 = add nuw nsw i16 %add9, %l3
-
-  ret i16 %add14
-}
diff --git a/test/CodeGen/AArch64/arm64-ldr-merge.ll b/test/CodeGen/AArch64/arm64-ldr-merge.ll

new file mode 100644 (file)

index 0000000..4e40bac
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ldr-merge.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -march=arm64 -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: Ldrh_merge
+; CHECK-NOT: ldrh
+; CHECK: ldr [[NEW_DEST:w[0-9]+]]
+; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
+; CHECK: lsr  w{{[0-9]+}}, [[NEW_DEST]]
+define i16 @Ldrh_merge(i16* nocapture readonly %p) {
+  %1 = load i16, i16* %p, align 2
+  %arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
+  %2 = load i16, i16* %arrayidx2, align 2
+  %add = add nuw nsw i16 %1, %2
+  ret i16 %add
+}
+
+; CHECK-LABEL: Ldurh_merge
+; CHECK-NOT: ldurh
+; CHECK: ldur [[NEW_DEST:w[0-9]+]]
+; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
+; CHECK: lsr  w{{[0-9]+}}, [[NEW_DEST]]
+define i16 @Ldurh_merge(i16* nocapture readonly %p)  {
+entry:
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
+  %0 = load i16, i16* %arrayidx
+  %arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
+  %1 = load i16, i16* %arrayidx3
+  %add = add nuw nsw i16 %0, %1
+  ret i16 %add
+}
+
+; CHECK-LABEL: Ldrh_4_merge
+; CHECK-NOT: ldrh
+; CHECK: ldp [[NEW_DEST:w[0-9]+]]
+define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
+  %arrayidx = getelementptr inbounds i16, i16* %P, i64 0
+  %l0 = load i16, i16* %arrayidx
+  %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
+  %l1 = load i16, i16* %arrayidx2
+  %arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
+  %l2 = load i16, i16* %arrayidx7
+  %arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
+  %l3 = load i16, i16* %arrayidx12
+  %add4 = add nuw nsw i16 %l1, %l0
+  %add9 = add nuw nsw i16 %add4, %l2
+  %add14 = add nuw nsw i16 %add9, %l3
+  ret i16 %add14
+}
author	Jun Bum Lim <junbuml@codeaurora.org>
	Fri, 6 Nov 2015 16:27:47 +0000 (16:27 +0000)
committer	Jun Bum Lim <junbuml@codeaurora.org>
	Fri, 6 Nov 2015 16:27:47 +0000 (16:27 +0000)
lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp		patch \| blob \| history
test/CodeGen/AArch64/arm64-ldp.ll		patch \| blob \| history
test/CodeGen/AArch64/arm64-ldr-merge.ll	[new file with mode: 0644]	patch \| blob