//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;
R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  computeRegisterProperties();

  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);

  setOperationAction(ISD::ADD, MVT::v4i32, Expand);
  setOperationAction(ISD::AND, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  setOperationAction(ISD::FPOW, MVT::f32, Custom);

  setOperationAction(ISD::ROTL, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);

  setSchedulingPreference(Sched::VLIW);
}
MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::SHADER_TYPE: break;
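  // CLAMP, FABS and FNEG have no R600 machine instruction of their own; each
  // is rewritten as a plain MOV whose operand flags carry the corresponding
  // source/result modifier bit.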
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }
  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }
  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }
  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }
  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
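  // RAT (Random Access Target) instructions are the cacheless global-memory
  // writes on Evergreen; the extra immediate marks whether this store is the
  // final instruction of the program.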
  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }
  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }
  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }
  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
            .addReg(0);
    break;
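  // Conditional branches become a PRED_X that computes the predicate bit
  // (flagged with PUSH so the predicate stack is maintained), followed by a
  // JUMP predicated on that bit.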
  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }
  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }
  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it is not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }
  MI->eraseFromParent();
  return BB;
}
//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

using namespace llvm::Intrinsic;
using namespace llvm::AMDGPUIntrinsic;
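// Merge a per-channel scalar export into the single EXPORT node tracked for
// its slot: the first write to a slot creates the export with that channel's
// bit set in the write mask; later writes insert their scalar into the
// existing vector operand and OR their channel bit into the mask.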
static SDValue
InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
    unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
    SDValue Scalar, SDValue Chain) {
  if (!ExportMap[Slot]) {
    SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
        DL, MVT::v4f32,
        DAG.getUNDEF(MVT::v4f32),
        Scalar,
        DAG.getConstant(Channel, MVT::i32));

    unsigned Mask = 1 << Channel;

    const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
        DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
        DAG.getConstant(Mask, MVT::i32)};

    SDValue Res = DAG.getNode(
        AMDGPUISD::EXPORT,
        DL,
        MVT::Other,
        Ops, 6);
    ExportMap[Slot] = Res.getNode();
    return Res;
  }

  SDNode *ExportInstruction = ExportMap[Slot];
  SDValue PreviousVector = ExportInstruction->getOperand(1);
  SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
      DL, MVT::v4f32,
      PreviousVector,
      Scalar,
      DAG.getConstant(Channel, MVT::i32));

  unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5))
      ->getZExtValue();
  Mask |= (1 << Channel);

  const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
      DAG.getConstant(Inst, MVT::i32),
      DAG.getConstant(Type, MVT::i32),
      DAG.getConstant(Slot, MVT::i32),
      DAG.getConstant(Mask, MVT::i32)};

  DAG.UpdateNodeOperands(ExportInstruction,
      Ops, 6);

  return Chain;
}
SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::ROTL: return LowerROTL(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::SETCC: return LowerSETCC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FPOW: return LowerFPOW(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_pixel_color: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();

      SDNode **OutputsMap = MFI->Outputs;
      return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
          RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
          Chain);
    }
    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    DebugLoc DL = Op.getDebugLoc();
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
    }
    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));

      return SDValue(interp, slot % 2);
    }
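    // The r600_read_* intrinsics below read the implicit kernel parameters
    // (work-group counts, global and local sizes), each stored as a single
    // dword at the offsets passed to LowerImplicitParameter.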
    case r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);
    case r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}
void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  }
}
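// An f32-to-i1 unsigned conversion only has to answer "is the value
// nonzero?", so it is lowered to a single compare against 0.0f.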
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      Op.getDebugLoc(),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE));
}
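// BR_CC is lowered in two steps: a SELECT_CC first materializes the
// comparison as an all-ones integer (or 1.0f) "true" value, and a
// BRANCH_COND node then branches on that value.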
SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue CC = Op.getOperand(1);
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue JumpT = Op.getOperand(4);
  SDValue CmpValue;
  SDValue Result;

  if (LHS.getValueType() == MVT::i32) {
    CmpValue = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::i32,
        LHS, RHS,
        DAG.getConstant(-1, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        CC);
  } else if (LHS.getValueType() == MVT::f32) {
    CmpValue = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::f32,
        LHS, RHS,
        DAG.getConstantFP(1.0f, MVT::f32),
        DAG.getConstantFP(0.0f, MVT::f32),
        CC);
  } else {
    assert(0 && "Not valid type for br_cc");
  }
  Result = DAG.getNode(
      AMDGPUISD::BRANCH_COND,
      CmpValue.getDebugLoc(),
      MVT::Other, Chain,
      JumpT, CmpValue);
  return Result;
}
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   DebugLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}
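// ROTL is lowered to BITALIGN, which selects a 32-bit window from the 64-bit
// concatenation x:x at bit offset (32 - y); for a 32-bit x that window is
// exactly x rotated left by y.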
SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
                     Op.getOperand(0),
                     Op.getOperand(0),
                     DAG.getNode(ISD::SUB, DL, VT,
                                 DAG.getConstant(32, MVT::i32),
                                 Op.getOperand(1)));
}
bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();
  // Check if we can lower this to a native operation.

  // Try to lower to a CND* instruction:
  // CND* instructions require RHS to be zero. Some SELECT_CC nodes that
  // can be lowered to CND* instructions can also be lowered to SET*
  // instructions. CND* instructions are cheaper, because they don't
  // require additional instructions to convert their result to the correct
  // value type, so this check should be first.
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }
  // Try to lower to a SET* instruction:
  // We need all the operands of SELECT_CC to have the same value type, so if
  // necessary we need to change True and False to be the same type as LHS and
  // RHS, and then convert the result of the select_cc back to the correct type.

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False)) {
    if (CompareVT != VT) {
      if (VT == MVT::f32 && CompareVT == MVT::i32) {
        SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
            LHS, RHS,
            DAG.getConstant(-1, MVT::i32),
            DAG.getConstant(0, MVT::i32),
            CC);
        // Convert integer values of true (-1) and false (0) to fp values of
        // true (1.0f) and false (0.0f).
        SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
                                  DAG.getConstant(1, MVT::i32));
        return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
      } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
        SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
            LHS, RHS,
            DAG.getConstantFP(1.0f, MVT::f32),
            DAG.getConstantFP(0.0f, MVT::f32),
            CC);
        // Convert fp values of true (1.0f) and false (0.0f) to integer values
        // of true (-1) and false (0).
        SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
        return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
      } else {
        // I don't think there will be any other type pairings.
        assert(!"Unhandled operand type pairings in SELECT_CC");
      }
    } else {
      // This SELECT_CC is already legal.
      return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
    }
  }
  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }
  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS,
                             HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}
SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
      Op.getDebugLoc(),
      Op.getValueType(),
      Op.getOperand(0),
      DAG.getConstant(0, MVT::i32),
      Op.getOperand(1),
      Op.getOperand(2),
      DAG.getCondCode(ISD::SETNE));
}
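// SETCC is lowered to a SELECT_CC producing -1/0 (integer) or 1.0f/0.0f
// (float, then converted to integer); the final AND with 1 normalizes the
// result to the 0/1 value SETCC is expected to produce.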
SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  DebugLoc DL = Op.getDebugLoc();
  assert(Op.getValueType() == MVT::i32);
  if (LHS.getValueType() == MVT::i32) {
    Cond = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::i32,
        LHS, RHS,
        DAG.getConstant(-1, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        CC);
  } else if (LHS.getValueType() == MVT::f32) {
    Cond = DAG.getNode(
        ISD::SELECT_CC,
        Op.getDebugLoc(),
        MVT::f32,
        LHS, RHS,
        DAG.getConstantFP(1.0f, MVT::f32),
        DAG.getConstantFP(0.0f, MVT::f32),
        CC);
    Cond = DAG.getNode(
        ISD::FP_TO_SINT,
        DL,
        MVT::i32,
        Cond);
  } else {
    assert(0 && "Not valid type for set_cc");
  }
  Cond = DAG.getNode(
      ISD::AND,
      DL,
      MVT::i32,
      DAG.getConstant(1, MVT::i32),
      Cond);
  return Cond;
}
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
    // Convert pointer from byte address to dword address.
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                  Ptr, DAG.getConstant(2, MVT::i32)));
  }
  if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
    assert(!"Truncated and indexed stores not supported yet");
  }
  Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
  return Chain;
}
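// Each constant buffer occupies 4096 slots starting at 512, so kc_bank N
// maps to base 512 + 4096 * N (e.g. CONSTANT_BUFFER_2 -> 8704).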
// Returns 512 + (kc_bank << 12).
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
        dyn_cast<Constant>(LoadNode->getSrcValue())) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        //   (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is the Ptr computed by LLVM using an alignment of 16.
        // Thus we add 4 * chan + (512 + (kc_bank << 12)) * 16 here and
        // then divide by 4 at the ISel step.
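        // Worked example (assumed values): kc_bank = 0, a 16-byte-aligned
        // Ptr of 32 (const_index = 2) and chan = 1 give
        // NewPtr = 32 + 4 * 1 + 512 * 16 = 8228; dividing by 4 at ISel
        // yields 2057 = ((512 + 2) << 2) + 1, matching the formula above.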
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
    } else {
      // A non-constant Ptr can't be folded; keep it as a v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                      DAG.getConstant(4, MVT::i32)));
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
        Result,
        Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  return SDValue();
}
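// FPOW is expanded with the identity pow(x, y) = exp2(y * log2(x)), which
// only needs the FLOG2 and FEXP2 operations used below.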
SDValue R600TargetLowering::LowerFPOW(SDValue Op,
    SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
  SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
  return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
}
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
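// Arguments start at byte offset 36 of the implicit parameter buffer: the
// nine dword-sized implicit parameters (offsets 0-8 above) occupy the first
// 9 * 4 = 36 bytes.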
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      DebugLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
      DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    EVT VT = Ins[i].VT;
    Type *ArgType = FuncArg->getType();
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
                             32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    EVT ArgVT;
    if (ArgSizeInBits < VT.getSizeInBits()) {
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    } else {
      ArgVT = VT;
    }
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                 MachinePointerInfo(new Argument(PtrTy)),
                                 ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    ParamOffsetBytes += ArgBytes;
  }
  return Chain;
}
EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}
//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//
SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }
  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }
  }
  return SDValue();
}