From ed143b7c0cca568a7dc35e91d2e37207f79b8d76 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 23 Jun 2014 18:28:28 +0000 Subject: [PATCH] R600/SI: Fix div_scale intrinsic. The operand that must match one of the others does matter, and implement selecting for it. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211523 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsR600.td | 7 +++- lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 28 +++++++++++++ lib/Target/R600/AMDGPUISelLowering.cpp | 16 +++++++- lib/Target/R600/SIInstrInfo.td | 16 ++++++++ lib/Target/R600/SIInstructions.td | 6 ++- test/CodeGen/R600/llvm.AMDGPU.div_scale.ll | 47 +++++++++++++++++----- 6 files changed, 104 insertions(+), 16 deletions(-) diff --git a/include/llvm/IR/IntrinsicsR600.td b/include/llvm/IR/IntrinsicsR600.td index 89b3ef3636e..2566cc2561e 100644 --- a/include/llvm/IR/IntrinsicsR600.td +++ b/include/llvm/IR/IntrinsicsR600.td @@ -38,8 +38,13 @@ defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < let TargetPrefix = "AMDGPU" in { def int_AMDGPU_div_scale : GCCBuiltin<"__builtin_amdgpu_div_scale">, + // 1st parameter: Numerator + // 2nd parameter: Denominator + // 3rd parameter: Constant to select select between first and + // second. (0 = first, 1 = second). Intrinsic<[llvm_anyfloat_ty, llvm_i1_ty], - [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], + [IntrNoMem]>; def int_AMDGPU_div_fmas : GCCBuiltin<"__builtin_amdgpu_div_fmas">, Intrinsic<[llvm_anyfloat_ty], diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index 85d0f9dd691..37071bc3f1c 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -86,6 +86,7 @@ private: bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); SDNode *SelectADD_SUB_I64(SDNode *N); + SDNode *SelectDIV_SCALE(SDNode *N); // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" @@ -454,6 +455,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { PackedOffsetWidth); } + case AMDGPUISD::DIV_SCALE: { + return SelectDIV_SCALE(N); + } } return SelectCode(N); } @@ -695,6 +699,30 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); } +SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { + SDLoc SL(N); + EVT VT = N->getValueType(0); + + assert(VT == MVT::f32 || VT == MVT::f64); + + unsigned Opc + = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; + + const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + + SDValue Ops[] = { + N->getOperand(0), + N->getOperand(1), + N->getOperand(2), + Zero, + Zero, + Zero, + Zero + }; + + return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast(getTargetLowering()); diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 6ff1703c919..ca8d0a1626b 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -771,9 +771,21 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::AMDGPU_div_scale: + case Intrinsic::AMDGPU_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT, - Op.getOperand(1), Op.getOperand(2)); + Src0, Denominator, Numerator); + } case Intrinsic::AMDGPU_div_fmas: return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index c4994a250a8..dd9a6dc527f 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -433,6 +433,22 @@ class VOP3_64 op, string opName, list pattern> : VOP3 < opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern >, VOP ; + +class VOP3b_Helper op, RegisterClass vrc, RegisterClass arc, + string opName, list pattern> : VOP3 < + op, (outs vrc:$dst0, SReg_64:$dst1), + (ins arc:$src0, arc:$src1, arc:$src2, + InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), + opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern +>, VOP ; + + +class VOP3b_64 op, string opName, list pattern> : + VOP3b_Helper ; + +class VOP3b_32 op, string opName, list pattern> : + VOP3b_Helper ; + //===----------------------------------------------------------------------===// // Vector I/O classes //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index f888b8e6e05..6a29dbe77dd 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -1455,8 +1455,10 @@ defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; } // isCommutable = 1 -defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; -def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; +def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>; + +// Double precision division pre-scale. +def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>; defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))] diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll index 1bcbe2f859e..527c8da10a3 100644 --- a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll +++ b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll @@ -1,23 +1,48 @@ -; XFAIL: * ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -declare float @llvm.AMDGPU.div.scale.f32(float, float) nounwind readnone -declare double @llvm.AMDGPU.div.scale.f64(double, double) nounwind readnone +declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone +declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone -; SI-LABEL @test_div_scale_f32: -define void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind { +; SI-LABEL @test_div_scale_f32_1: +; SI: V_DIV_SCALE_F32 +define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind { %a = load float addrspace(1)* %aptr, align 4 %b = load float addrspace(1)* %bptr, align 4 - %result = call float @llvm.AMDGPU.div.scale.f32(float %a, float %b) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 ret void } -; SI-LABEL @test_div_scale_f64: -define void @test_div_scale_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr) nounwind { +; SI-LABEL @test_div_scale_f32_2: +; SI: V_DIV_SCALE_F32 +define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind { + %a = load float addrspace(1)* %aptr, align 4 + %b = load float addrspace(1)* %bptr, align 4 + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_1: +; SI: V_DIV_SCALE_F64 +define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind { + %a = load double addrspace(1)* %aptr, align 8 + %b = load double addrspace(1)* %bptr, align 8 + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_1: +; SI: V_DIV_SCALE_F64 +define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind { %a = load double addrspace(1)* %aptr, align 8 %b = load double addrspace(1)* %bptr, align 8 - %result = call double @llvm.AMDGPU.div.scale.f64(double %a, double %b) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 ret void } -- 2.34.1