From: Matt Arsenault Date: Fri, 20 Feb 2015 22:10:41 +0000 (+0000) Subject: R600: Use new fmad node. X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=bbb748eece06d7bc0e4dbf32cc8212157a4600f8;p=oota-llvm.git R600: Use new fmad node. This enables a few useful combines that used to only use fma. Also since v_mad_f32 apparently does not support denormals, disable the existing cases that are custom handled if they are requested. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@230071 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index d96f03aaeb0..18697c8d3f6 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -133,6 +133,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::FREM, MVT::f32, Custom); setOperationAction(ISD::FREM, MVT::f64, Custom); + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. 
setOperationAction(ISD::STORE, MVT::f32, Promote); @@ -384,6 +388,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); @@ -2611,7 +2618,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(MAD) NODE_NAME_CASE(FMAX_LEGACY) NODE_NAME_CASE(SMAX) NODE_NAME_CASE(UMAX) diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index caf96539430..6bc6ca5bbc5 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -218,7 +218,6 @@ enum { DWORDADDR, FRACT, CLAMP, - MAD, // Multiply + add with same result as the separate operations. // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. // Denormals handled on some parts. 
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index d657ad05c8c..901eb5110f2 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -78,7 +78,6 @@ def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, >; def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; -def AMDGPUmad : SDNode<"AMDGPUISD::MAD", SDTFPTernaryOp, []>; // out = max(a, b) a and b are signed ints def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 8b5fe8c09db..849b241f63e 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -413,11 +413,6 @@ def atomic_xor_global : global_binary_atomic_op; // Misc Pattern Fragments //===----------------------------------------------------------------------===// -def fmad : PatFrag < - (ops node:$src0, node:$src1, node:$src2), - (fadd (fmul node:$src0, node:$src1), node:$src2) ->; - class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 08e3d51fab6..291fb0459e2 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -920,7 +920,7 @@ class MULADD_Common inst> : R600_3OP < class MULADD_IEEE_Common inst> : R600_3OP < inst, "MULADD_IEEE", - [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] + [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] >; class FMA_Common inst> : R600_3OP < diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 2821a0ce475..74287144cc2 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -680,8 +680,9 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { case MVT::f32: // This is as fast on some subtargets. 
However, we always have full rate f32 // mad available which returns the same result as the separate operations - // which we should prefer over fma. - return false; + // which we should prefer over fma. We can't use this if we want to support + // denormals, so only report this in these cases. + return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); case MVT::f64: return true; default: @@ -1642,6 +1643,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (VT != MVT::f32) break; + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (Subtarget->hasFP32Denormals()) + break; + SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -1653,7 +1659,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, SDValue A = LHS.getOperand(0); if (A == LHS.getOperand(1)) { const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); } } @@ -1662,11 +1668,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, SDValue A = RHS.getOperand(0); if (A == RHS.getOperand(1)) { const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); } } - break; + return SDValue(); } case ISD::FSUB: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) @@ -1676,30 +1682,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, // Try to get the fneg to fold into the source modifier. This undoes generic // DAG combines and folds them into the mad. - if (VT == MVT::f32) { + // + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. 
+ if (VT == MVT::f32 && + !Subtarget->hasFP32Denormals()) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - - if (LHS.getOpcode() == ISD::FMUL) { - // (fsub (fmul a, b), c) -> mad a, b, (fneg c) - - SDValue A = LHS.getOperand(0); - SDValue B = LHS.getOperand(1); - SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS); - - return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); - } - - if (RHS.getOpcode() == ISD::FMUL) { - // (fsub c, (fmul a, b)) -> mad (fneg a), b, c - - SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0)); - SDValue B = RHS.getOperand(1); - SDValue C = LHS; - - return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); - } - if (LHS.getOpcode() == ISD::FADD) { // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) @@ -1708,7 +1697,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); } } @@ -1718,9 +1707,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, SDValue A = RHS.getOperand(0); if (A == RHS.getOperand(1)) { const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS); + return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); } } + + return SDValue(); } break; diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 2c28e03af88..4c5493ea5e4 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -2783,9 +2783,6 @@ def : Pat < (V_MUL_HI_I32 $src0, $src1) >; -def : Vop3ModPat; - - defm : BFIPatterns ; def : ROTRPattern ; diff --git a/test/CodeGen/R600/mad-combine.ll b/test/CodeGen/R600/mad-combine.ll new file mode 100644 index 00000000000..8c4e09bbea1 --- /dev/null +++ b/test/CodeGen/R600/mad-combine.ll @@ -0,0 +1,567 @@ +; Make sure we still form mad even when unsafe math or 
fp-contract is allowed instead of fma. + +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s + +; Make sure we don't form mad with denormals +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare float @llvm.fabs.f32(float) #0 +declare float @llvm.fma.f32(float, float, float) #0 +declare float @llvm.fmuladd.f32(float, float, float) #0 + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_mad_f32_0: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] + +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] + +; SI-DENORM-SLOWFMAF-NOT: v_fma +; SI-DENORM-SLOWFMAF-NOT: v_mad + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* 
noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float addrspace(1)* %out, i32 %tid + + %a = load float addrspace(1)* %gep.0 + %b = load float addrspace(1)* %gep.1 + %c = load float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fadd float %mul, %c + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 
= getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1 + + %a = load float addrspace(1)* %gep.0 + %b = load float addrspace(1)* %gep.1 + %c = load float addrspace(1)* %gep.2 + %d = load float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %fma0 = fadd float %mul, %c + %fma1 = fadd float %mul, %d + + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fadd x, (fmul y, z)) -> (fma y, z, x) +; FUNC-LABEL: {{^}}combine_to_mad_f32_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float addrspace(1)* %out, i32 %tid + + %a = load float addrspace(1)* %gep.0 + %b = load float addrspace(1)* %gep.1 + %c = load float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b 
+ %fma = fadd float %c, %mul + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float addrspace(1)* %out, i32 %tid + + %a = load float addrspace(1)* %gep.0 + %b = load float addrspace(1)* %gep.1 + %c = load float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fsub float %mul, %c + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, 
s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1 + + %a = load float addrspace(1)* %gep.0 + %b = load float addrspace(1)* %gep.1 + %c = load float addrspace(1)* %gep.2 + %d = load float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %fma0 = fsub float %mul, %c + %fma1 = fsub float %mul, %d + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 
offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float addrspace(1)* %out, i32 %tid + + %a = load float addrspace(1)* %gep.0 + %b = load float addrspace(1)* %gep.1 + %c = load float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fsub float %c, %mul + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 
[[RESULT1:v[0-9]+]], [[TMP]], [[D]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1 + + %a = load float addrspace(1)* %gep.0 + %b = load float addrspace(1)* %gep.1 + %c = load float addrspace(1)* %gep.2 + %d = load float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %fma0 = fsub float %c, %mul + %fma1 = fsub float %d, %mul + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] + +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_fsub_2_f32(float addrspace(1)* 
noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float addrspace(1)* %out, i32 %tid + + %a = load float addrspace(1)* %gep.0 + %b = load float addrspace(1)* %gep.1 + %c = load float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %mul.neg = fsub float -0.0, %mul + %fma = fsub float %mul.neg, %c + + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1 + + %a = load float addrspace(1)* %gep.0 + %b = load float addrspace(1)* %gep.1 + %c = load float addrspace(1)* %gep.2 + %d = load float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %mul.neg = fsub float -0.0, %mul + %fma0 = fsub float %mul.neg, %c + %fma1 = fsub float %mul.neg, %d + + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void 
@combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1 + + %a = load float addrspace(1)* %gep.0 + %b = load float addrspace(1)* %gep.1 + %c = load float addrspace(1)* %gep.2 + %d = load float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %mul.neg = fsub float -0.0, %mul + %fma0 = fsub float %mul.neg, %c + %fma1 = fsub float %mul, %d + + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z))) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] +; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]] + +; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32
[[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float addrspace(1)* %out, i32 %tid + + %x = load float addrspace(1)* %gep.0 + %y = load float addrspace(1)* %gep.1 + %z = load float addrspace(1)* %gep.2 + %u = load float addrspace(1)* %gep.3 + %v = load float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0 + %tmp2 = fsub float %tmp1, %z + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +; fold (fsub x, (fma y, z, (fmul u, v))) +; -> (fma (fneg y), z, (fma (fneg u), v, x)) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-STD: 
v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] +; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] + +; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: s_endpgm +define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float addrspace(1)* %out, i32 %tid + + %x = load float addrspace(1)* %gep.0 + %y = load float addrspace(1)* %gep.1 + %z = load float addrspace(1)* %gep.2 + %u = load float addrspace(1)* %gep.3 + %v = load float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0 + %tmp2 = fsub float %x, %tmp1 + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +; fold (fsub (fmuladd x, y, (fmul u, v)), z) -> (fmuladd x, y, (fmuladd u, v, (fneg z))) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG:
buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] + +; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: s_endpgm +define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float addrspace(1)* %out, i32 %tid + + %x = load float addrspace(1)* %gep.0 + %y = load float addrspace(1)* %gep.1 + %z = load float addrspace(1)* %gep.2 + %u = load float addrspace(1)* %gep.3 + %v = load float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0 + %tmp2 = fsub float %tmp1, %z + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +; fold (fsub x, (fmuladd y, z, (fmul u, v))) +; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x)) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32: +; SI-DAG: 
buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] + +; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: s_endpgm +define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float addrspace(1)* %out, i32 %tid + + %x = load float addrspace(1)* %gep.0 + %y = load float addrspace(1)* %gep.1 + %z = load float addrspace(1)* %gep.2 + %u = load float addrspace(1)* %gep.3 + %v = load float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float 
@llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 + %tmp2 = fsub float %x, %tmp1 + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind }