From 0f2eec65fb9e9e1dee3f672d38d03d047936a62a Mon Sep 17 00:00:00 2001
From: Andrew Trick <atrick@apple.com>
Date: Sun, 23 Jun 2013 09:00:28 +0000
Subject: [PATCH] Add MI-Sched support for x86 macro fusion.

This is an awful implementation of the target hook. But we don't have
abstractions yet for common machine ops, and I don't see any quick way
to make it table-driven.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@184664 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrInfo.cpp    | 161 +++++++++++++++++++++++++++++
 lib/Target/X86/X86InstrInfo.h      |   3 +
 test/CodeGen/X86/misched-fusion.ll | 108 +++++++++++++++++++
 3 files changed, 272 insertions(+)
 create mode 100644 test/CodeGen/X86/misched-fusion.ll

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 0688c9ba5d2..0443a93137b 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -4647,6 +4647,167 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
   return true;
 }
 
+bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
+                                          MachineInstr *Second) const {
+  // Check if this processor supports macro-fusion. Since this is a minor
+  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+  // proxy for SandyBridge+.
+  if (!TM.getSubtarget<X86Subtarget>().hasAVX())
+    return false;
+
+  enum {
+    FuseTest,
+    FuseCmp,
+    FuseInc
+  } FuseKind;
+
+  switch(Second->getOpcode()) {
+  default:
+    return false;
+  case X86::JE_4:
+  case X86::JNE_4:
+  case X86::JL_4:
+  case X86::JLE_4:
+  case X86::JG_4:
+  case X86::JGE_4:
+    FuseKind = FuseInc;
+    break;
+  case X86::JB_4:
+  case X86::JBE_4:
+  case X86::JA_4:
+  case X86::JAE_4:
+    FuseKind = FuseCmp;
+    break;
+  case X86::JS_4:
+  case X86::JNS_4:
+  case X86::JP_4:
+  case X86::JNP_4:
+  case X86::JO_4:
+  case X86::JNO_4:
+    FuseKind = FuseTest;
+    break;
+  }
+  switch (First->getOpcode()) {
+  default:
+    return false;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+  case X86::TEST8ri:
+  case X86::TEST16ri:
+  case X86::TEST32ri:
+  case X86::TEST32i32:
+  case X86::TEST64i32:
+  case X86::TEST64ri32:
+  case X86::TEST8rm:
+  case X86::TEST16rm:
+  case X86::TEST32rm:
+  case X86::TEST64rm:
+  case X86::AND16i16:
+  case X86::AND16ri:
+  case X86::AND16ri8:
+  case X86::AND16rm:
+  case X86::AND16rr:
+  case X86::AND32i32:
+  case X86::AND32ri:
+  case X86::AND32ri8:
+  case X86::AND32rm:
+  case X86::AND32rr:
+  case X86::AND64i32:
+  case X86::AND64ri32:
+  case X86::AND64ri8:
+  case X86::AND64rm:
+  case X86::AND64rr:
+  case X86::AND8i8:
+  case X86::AND8ri:
+  case X86::AND8rm:
+  case X86::AND8rr:
+    return true;
+  case X86::CMP16i16:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP16rm:
+  case X86::CMP16rr:
+  case X86::CMP32i32:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP32rm:
+  case X86::CMP32rr:
+  case X86::CMP64i32:
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP64rm:
+  case X86::CMP64rr:
+  case X86::CMP8i8:
+  case X86::CMP8ri:
+  case X86::CMP8rm:
+  case X86::CMP8rr:
+  case X86::ADD16i16:
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri8_DB:
+  case X86::ADD16ri_DB:
+  case X86::ADD16rm:
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+  case X86::ADD32i32:
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri8_DB:
+  case X86::ADD32ri_DB:
+  case X86::ADD32rm:
+  case X86::ADD32rr:
+  case X86::ADD32rr_DB:
+  case X86::ADD64i32:
+  case X86::ADD64ri32:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8:
+  case X86::ADD64ri8_DB:
+  case X86::ADD64rm:
+  case X86::ADD64rr:
+  case X86::ADD64rr_DB:
+  case X86::ADD8i8:
+  case X86::ADD8mi:
+  case X86::ADD8mr:
+  case X86::ADD8ri:
+  case X86::ADD8rm:
+  case X86::ADD8rr:
+  case X86::SUB16i16:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB16rm:
+  case X86::SUB16rr:
+  case X86::SUB32i32:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB32rm:
+  case X86::SUB32rr:
+  case X86::SUB64i32:
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB64rm:
+  case X86::SUB64rr:
+  case X86::SUB8i8:
+  case X86::SUB8ri:
+  case X86::SUB8rm:
+  case X86::SUB8rr:
+    return FuseKind == FuseCmp || FuseKind == FuseInc;
+  case X86::INC16r:
+  case X86::INC32r:
+  case X86::INC64_16r:
+  case X86::INC64_32r:
+  case X86::INC64r:
+  case X86::INC8r:
+  case X86::DEC16r:
+  case X86::DEC32r:
+  case X86::DEC64_16r:
+  case X86::DEC64_32r:
+  case X86::DEC64r:
+  case X86::DEC8r:
+    return FuseKind == FuseInc;
+  }
+}
+
 bool X86InstrInfo::
 ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index f2f47efaef8..a0d1ba75aaa 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -339,6 +339,9 @@ public:
                                        int64_t Offset1, int64_t Offset2,
                                        unsigned NumLoads) const;
 
+  virtual bool shouldScheduleAdjacent(MachineInstr* First,
+                                      MachineInstr *Second) const LLVM_OVERRIDE;
+
   virtual void getNoopForMachoTarget(MCInst &NopInst) const;
 
   virtual
diff --git a/test/CodeGen/X86/misched-fusion.ll b/test/CodeGen/X86/misched-fusion.ll
new file mode 100644
index 00000000000..859d92d6978
--- /dev/null
+++ b/test/CodeGen/X86/misched-fusion.ll
@@ -0,0 +1,108 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -disable-lsr -pre-RA-sched=source -enable-misched -verify-machineinstrs | FileCheck %s
+
+; Verify that TEST+JE are scheduled together.
+; CHECK: test_je
+; CHECK: %loop
+; CHECK: test
+; CHECK-NEXT: je
+define void @test_je() {
+entry:
+  br label %loop
+
+loop:
+  %var = phi i32* [ null, %entry ], [ %next.load, %loop1 ], [ %var, %loop2 ]
+  %next.ptr = phi i32** [ null, %entry ], [ %next.ptr, %loop1 ], [ %gep, %loop2 ]
+  br label %loop1
+
+loop1:
+  %cond = icmp eq i32* %var, null
+  %next.load = load i32** %next.ptr
+  br i1 %cond, label %loop, label %loop2
+
+loop2:                                            ; preds = %loop1
+  %gep = getelementptr inbounds i32** %next.ptr, i32 1
+  store i32* %next.load, i32** undef
+  br label %loop
+}
+
+; Verify that DEC+JE are scheduled together.
+; CHECK: dec_je
+; CHECK: %loop1
+; CHECK: dec
+; CHECK-NEXT: je
+define void @dec_je() {
+entry:
+  br label %loop
+
+loop:
+  %var = phi i32 [ 0, %entry ], [ %next.var, %loop1 ], [ %var2, %loop2 ]
+  %next.ptr = phi i32** [ null, %entry ], [ %next.ptr, %loop1 ], [ %gep, %loop2 ]
+  br label %loop1
+
+loop1:
+  %var2 = sub i32 %var, 1
+  %cond = icmp eq i32 %var2, 0
+  %next.load = load i32** %next.ptr
+  %next.var = load i32* %next.load
+  br i1 %cond, label %loop, label %loop2
+
+loop2:
+  %gep = getelementptr inbounds i32** %next.ptr, i32 1
+  store i32* %next.load, i32** undef
+  br label %loop
+}
+
+; DEC+JS should *not* be scheduled together.
+; CHECK: dec_js
+; CHECK: %loop1
+; CHECK: dec
+; CHECK: mov
+; CHECK: js
+define void @dec_js() {
+entry:
+  br label %loop2a
+
+loop2a:                                           ; preds = %loop1, %body, %entry
+  %var = phi i32 [ 0, %entry ], [ %next.var, %loop1 ], [ %var2, %loop2b ]
+  %next.ptr = phi i32** [ null, %entry ], [ %next.ptr, %loop1 ], [ %gep, %loop2b ]
+  br label %loop1
+
+loop1:                                            ; preds = %loop2a, %loop2b
+  %var2 = sub i32 %var, 1
+  %cond = icmp slt i32 %var2, 0
+  %next.load = load i32** %next.ptr
+  %next.var = load i32* %next.load
+  br i1 %cond, label %loop2a, label %loop2b
+
+loop2b:                                           ; preds = %loop1
+  %gep = getelementptr inbounds i32** %next.ptr, i32 1
+  store i32* %next.load, i32** undef
+  br label %loop2a
+}
+
+; Verify that CMP+JB are scheduled together.
+; CHECK: cmp_jb
+; CHECK: %loop1
+; CHECK: cmp
+; CHECK-NEXT: jb
+define void @cmp_jb(i32 %n) {
+entry:
+  br label %loop2a
+
+loop2a:                                           ; preds = %loop1, %body, %entry
+  %var = phi i32 [ 0, %entry ], [ %next.var, %loop1 ], [ %var2, %loop2b ]
+  %next.ptr = phi i32** [ null, %entry ], [ %next.ptr, %loop1 ], [ %gep, %loop2b ]
+  br label %loop1
+
+loop1:                                            ; preds = %loop2a, %loop2b
+  %var2 = sub i32 %var, 1
+  %cond = icmp ult i32 %var2, %n
+  %next.load = load i32** %next.ptr
+  %next.var = load i32* %next.load
+  br i1 %cond, label %loop2a, label %loop2b
+
+loop2b:                                           ; preds = %loop1
+  %gep = getelementptr inbounds i32** %next.ptr, i32 1
+  store i32* %next.load, i32** undef
+  br label %loop2a
+}
-- 
2.34.1
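
The commit message calls the hook an awful implementation because the opcode
classification is two hand-written switches, and notes there is no quick way
to make it table-driven. As a rough sketch only, not part of r184664, the
self-contained C++ fragment below shows one hypothetical table-driven shape
for the same classification. The FuseMask, FuseEntry, and mayFuse names and
the OP_* stand-in opcode constants are invented for this illustration; a real
version would use the X86:: enumerators from the switches above.

// Hypothetical sketch of a table-driven replacement for the second switch in
// shouldScheduleAdjacent(). Each first instruction maps to a bitmask of the
// branch kinds it may macro-fuse with.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iterator>

// One bit per FuseKind from the patch; the branch opcode selects exactly one.
enum FuseMask : uint8_t {
  MayFuseTest = 1 << 0, // FuseTest branches: JS/JNS/JP/JNP/JO/JNO
  MayFuseCmp  = 1 << 1, // FuseCmp branches:  JB/JBE/JA/JAE
  MayFuseInc  = 1 << 2  // FuseInc branches:  JE/JNE/JL/JLE/JG/JGE
};

struct FuseEntry {
  unsigned Opcode;  // would be an X86:: enumerator in a real table
  uint8_t  Allowed; // branch kinds this first instruction may fuse with
};

// Stand-in opcode values so the sketch compiles on its own; kept sorted
// because mayFuse() binary-searches the table.
enum : unsigned { OP_TEST32rr = 1, OP_CMP32rr = 2, OP_INC32r = 3 };

// Excerpt of the table. Per the patch: TEST/AND fuse with every branch kind,
// CMP/ADD/SUB only with FuseCmp and FuseInc branches, INC/DEC only with
// FuseInc branches.
static const FuseEntry FuseTable[] = {
  {OP_TEST32rr, MayFuseTest | MayFuseCmp | MayFuseInc},
  {OP_CMP32rr,  MayFuseCmp | MayFuseInc},
  {OP_INC32r,   MayFuseInc},
};

// True if the first instruction may macro-fuse with a branch whose kind is
// KindBit (one of the FuseMask bits).
static bool mayFuse(unsigned FirstOpcode, uint8_t KindBit) {
  const FuseEntry *I = std::lower_bound(
      std::begin(FuseTable), std::end(FuseTable), FirstOpcode,
      [](const FuseEntry &E, unsigned Op) { return E.Opcode < Op; });
  return I != std::end(FuseTable) && I->Opcode == FirstOpcode &&
         (I->Allowed & KindBit) != 0;
}

int main() {
  // CMP may fuse with a FuseCmp branch (e.g. JB) but not a FuseTest one (JS).
  std::printf("cmp+jb: %d, cmp+js: %d\n",
              mayFuse(OP_CMP32rr, MayFuseCmp),
              mayFuse(OP_CMP32rr, MayFuseTest));
  return 0;
}

With a table like this, the hook would reduce to one small switch mapping the
branch opcode to a single FuseMask bit followed by one binary search, and each
newly fusable opcode would become a one-line table entry instead of another
case label.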