From 272458bd06d0c6d09e9bf776fb60735b0cdc8cf1 Mon Sep 17 00:00:00 2001 From: Vincent Lejeune Date: Fri, 19 Jul 2013 21:45:15 +0000 Subject: [PATCH] R600: Don't emit empty then clause and use alu_pop_after git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186725 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDILCFGStructurizer.cpp | 8 +- lib/Target/R600/R600ControlFlowFinalizer.cpp | 48 +++++-- lib/Target/R600/R600Instructions.td | 1 + test/CodeGen/R600/jump-address.ll | 2 +- test/CodeGen/R600/loop-address.ll | 9 +- test/CodeGen/R600/r600cfg.ll | 124 +++++++++++++++++++ 6 files changed, 175 insertions(+), 17 deletions(-) create mode 100644 test/CodeGen/R600/r600cfg.ll diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp index 85ac72542a4..fac56f07468 100644 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -1039,8 +1039,11 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { } else if (FalseMBB->succ_size() == 1 && *FalseMBB->succ_begin() == TrueMBB) { // Triangle pattern, true is empty - LandBlk = TrueMBB; - TrueMBB = NULL; + // We reverse the predicate to make a triangle, empty false pattern; + std::swap(TrueMBB, FalseMBB); + reversePredicateSetter(MBB->end()); + LandBlk = FalseMBB; + FalseMBB = NULL; } else if (FalseMBB->succ_size() == 1 && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { LandBlk = *FalseMBB->succ_begin(); @@ -1456,6 +1459,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { + assert (TrueMBB); DEBUG( dbgs() << "ifPattern BB" << MBB->getNumber(); dbgs() << "{ "; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index 1cd0ac38a22..b69d38bbddd 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -347,6 +347,9 @@ public: MaxStack = 1; } std::vector FetchClauses, AluClauses; + std::vector LastAlu(1); + std::vector ToPopAfter; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { @@ -357,6 +360,10 @@ public: } MachineBasicBlock::iterator MI = I; + if (MI->getOpcode() != AMDGPU::ENDIF) + LastAlu.back() = 0; + if (MI->getOpcode() == AMDGPU::CF_ALU) + LastAlu.back() = MI; I++; switch (MI->getOpcode()) { case AMDGPU::CF_ALU_PUSH_BEFORE: @@ -403,6 +410,7 @@ public: break; } case AMDGPU::IF_PREDICATE_SET: { + LastAlu.push_back(0); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP)) .addImm(0) @@ -420,7 +428,7 @@ public: MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_ELSE)) .addImm(0) - .addImm(1); + .addImm(0); DEBUG(dbgs() << CfCount << ":"; MIb->dump();); IfThenElseStack.push_back(MIb); MI->eraseFromParent(); @@ -429,17 +437,24 @@ public: } case AMDGPU::ENDIF: { CurrentStack--; + if (LastAlu.back()) { + ToPopAfter.push_back(LastAlu.back()); + } else { + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_POP)) + .addImm(CfCount + 1) + .addImm(1); + (void)MIb; + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + CfCount++; + } + MachineInstr *IfOrElseInst = IfThenElseStack.back(); IfThenElseStack.pop_back(); - CounterPropagateAddr(IfOrElseInst, CfCount + 1); - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_POP)) - .addImm(CfCount + 1) - .addImm(1); - (void)MIb; - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + CounterPropagateAddr(IfOrElseInst, CfCount); + IfOrElseInst->getOperand(1).setImm(1); + LastAlu.pop_back(); MI->eraseFromParent(); - CfCount++; break; } case AMDGPU::PREDICATED_BREAK: { @@ -484,6 +499,21 @@ public: break; } } + for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { + MachineInstr *Alu = ToPopAfter[i]; + BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), + TII->get(AMDGPU::CF_ALU_POP_AFTER)) + .addImm(Alu->getOperand(0).getImm()) + .addImm(Alu->getOperand(1).getImm()) + .addImm(Alu->getOperand(2).getImm()) + .addImm(Alu->getOperand(3).getImm()) + .addImm(Alu->getOperand(4).getImm()) + .addImm(Alu->getOperand(5).getImm()) + .addImm(Alu->getOperand(6).getImm()) + .addImm(Alu->getOperand(7).getImm()) + .addImm(Alu->getOperand(8).getImm()); + Alu->eraseFromParent(); + } MFI->StackSize = getHWStackSize(MaxStack, HasPush); } diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index df5c438d51a..3652c89b727 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -624,6 +624,7 @@ ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { def CF_ALU : ALU_CLAUSE<8, "ALU">; def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">; +def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">; def FETCH_CLAUSE : AMDGPUInst <(outs), (ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { diff --git a/test/CodeGen/R600/jump-address.ll b/test/CodeGen/R600/jump-address.ll index 9a5f1bc3acb..26c298b9d81 100644 --- a/test/CodeGen/R600/jump-address.ll +++ b/test/CodeGen/R600/jump-address.ll @@ -1,6 +1,6 @@ ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; CHECK: JUMP @7 +; CHECK: JUMP @5 ; CHECK: EXPORT ; CHECK-NOT: EXPORT diff --git a/test/CodeGen/R600/loop-address.ll b/test/CodeGen/R600/loop-address.ll index 8a5458b8980..23be327c6e3 100644 --- a/test/CodeGen/R600/loop-address.ll +++ b/test/CodeGen/R600/loop-address.ll @@ -2,12 +2,11 @@ ;CHECK: TEX ;CHECK: ALU_PUSH -;CHECK: JUMP @4 -;CHECK: ELSE @16 +;CHECK: JUMP @15 ;CHECK: TEX -;CHECK: LOOP_START_DX10 @15 -;CHECK: LOOP_BREAK @14 -;CHECK: POP @16 +;CHECK: LOOP_START_DX10 @14 +;CHECK: LOOP_BREAK @13 +;CHECK: POP @15 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64" target triple = "r600--" diff --git a/test/CodeGen/R600/r600cfg.ll b/test/CodeGen/R600/r600cfg.ll new file mode 100644 index 00000000000..895ad5e1c8d --- /dev/null +++ b/test/CodeGen/R600/r600cfg.ll @@ -0,0 +1,124 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood +;REQUIRES: asserts + +define void @main() #0 { +main_body: + %0 = call float @llvm.R600.load.input(i32 4) + %1 = call float @llvm.R600.load.input(i32 5) + %2 = call float @llvm.R600.load.input(i32 6) + %3 = call float @llvm.R600.load.input(i32 7) + %4 = bitcast float %0 to i32 + %5 = icmp eq i32 %4, 0 + %6 = sext i1 %5 to i32 + %7 = bitcast i32 %6 to float + %8 = bitcast float %7 to i32 + %9 = icmp ne i32 %8, 0 + %. = select i1 %9, float 0x36A0000000000000, float %0 + br label %LOOP + +LOOP: ; preds = %LOOP47, %main_body + %temp12.0 = phi float [ 0x36A0000000000000, %main_body ], [ %temp12.1, %LOOP47 ] + %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %38, %LOOP47 ] + %temp4.1 = phi float [ %., %main_body ], [ %52, %LOOP47 ] + %10 = bitcast float %temp4.1 to i32 + %11 = icmp eq i32 %10, 1 + %12 = sext i1 %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float %13 to i32 + %15 = icmp ne i32 %14, 0 + br i1 %15, label %IF41, label %ENDIF40 + +IF41: ; preds = %LOOP + %16 = insertelement <4 x float> undef, float %0, i32 0 + %17 = insertelement <4 x float> %16, float %temp8.0, i32 1 + %18 = insertelement <4 x float> %17, float %temp12.0, i32 2 + %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1) + %20 = insertelement <4 x float> undef, float %0, i32 0 + %21 = insertelement <4 x float> %20, float %temp8.0, i32 1 + %22 = insertelement <4 x float> %21, float %temp12.0, i32 2 + %23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2) + %24 = insertelement <4 x float> undef, float %0, i32 0 + %25 = insertelement <4 x float> %24, float %temp8.0, i32 1 + %26 = insertelement <4 x float> %25, float %temp12.0, i32 2 + %27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4) + %28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1 + %30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2 + %31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1) + %32 = insertelement <4 x float> undef, float %0, i32 0 + %33 = insertelement <4 x float> %32, float %temp8.0, i32 1 + %34 = insertelement <4 x float> %33, float %temp12.0, i32 2 + %35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2) + ret void + +ENDIF40: ; preds = %LOOP + %36 = bitcast float %temp8.0 to i32 + %37 = add i32 %36, 1 + %38 = bitcast i32 %37 to float + %39 = bitcast float %temp4.1 to i32 + %40 = urem i32 %39, 2 + %41 = bitcast i32 %40 to float + %42 = bitcast float %41 to i32 + %43 = icmp eq i32 %42, 0 + %44 = sext i1 %43 to i32 + %45 = bitcast i32 %44 to float + %46 = bitcast float %45 to i32 + %47 = icmp ne i32 %46, 0 + %48 = bitcast float %temp4.1 to i32 + br i1 %47, label %IF44, label %ELSE45 + +IF44: ; preds = %ENDIF40 + %49 = udiv i32 %48, 2 + br label %ENDIF43 + +ELSE45: ; preds = %ENDIF40 + %50 = mul i32 3, %48 + %51 = add i32 %50, 1 + br label %ENDIF43 + +ENDIF43: ; preds = %ELSE45, %IF44 + %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ] + %52 = bitcast i32 %.sink to float + %53 = load <4 x float> addrspace(8)* null + %54 = extractelement <4 x float> %53, i32 0 + %55 = bitcast float %54 to i32 + br label %LOOP47 + +LOOP47: ; preds = %ENDIF48, %ENDIF43 + %temp12.1 = phi float [ %temp12.0, %ENDIF43 ], [ %67, %ENDIF48 ] + %temp28.0 = phi float [ 0.000000e+00, %ENDIF43 ], [ %70, %ENDIF48 ] + %56 = bitcast float %temp28.0 to i32 + %57 = icmp uge i32 %56, %55 + %58 = sext i1 %57 to i32 + %59 = bitcast i32 %58 to float + %60 = bitcast float %59 to i32 + %61 = icmp ne i32 %60, 0 + br i1 %61, label %LOOP, label %ENDIF48 + +ENDIF48: ; preds = %LOOP47 + %62 = bitcast float %temp12.1 to i32 + %63 = mul i32 %62, 2 + %64 = bitcast i32 %63 to float + %65 = bitcast float %64 to i32 + %66 = urem i32 %65, 2147483647 + %67 = bitcast i32 %66 to float + %68 = bitcast float %temp28.0 to i32 + %69 = add i32 %68, 1 + %70 = bitcast i32 %69 to float + br label %LOOP47 +} + +; Function Attrs: readnone +declare float @llvm.R600.load.input(i32) #1 + +declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32) + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } -- 2.34.1