1 //===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //==-----------------------------------------------------------------------===//
11 /// \brief TargetLowering functions borrowed from AMDIL.
13 //===----------------------------------------------------------------------===//
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPURegisterInfo.h"
17 #include "AMDILDevices.h"
18 #include "AMDILIntrinsicInfo.h"
19 #include "AMDGPUSubtarget.h"
20 #include "llvm/CallingConv.h"
21 #include "llvm/CodeGen/MachineFrameInfo.h"
22 #include "llvm/CodeGen/MachineRegisterInfo.h"
23 #include "llvm/CodeGen/PseudoSourceValue.h"
24 #include "llvm/CodeGen/SelectionDAG.h"
25 #include "llvm/CodeGen/SelectionDAGNodes.h"
26 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
27 #include "llvm/DerivedTypes.h"
28 #include "llvm/Instructions.h"
29 #include "llvm/Intrinsics.h"
30 #include "llvm/Support/raw_ostream.h"
31 #include "llvm/Target/TargetInstrInfo.h"
32 #include "llvm/Target/TargetOptions.h"
35 //===----------------------------------------------------------------------===//
36 // Calling Convention Implementation
37 //===----------------------------------------------------------------------===//
38 #include "AMDGPUGenCallingConv.inc"
40 //===----------------------------------------------------------------------===//
41 // TargetLowering Implementation Help Functions End
42 //===----------------------------------------------------------------------===//
44 //===----------------------------------------------------------------------===//
45 // TargetLowering Class Implementation Begins
46 //===----------------------------------------------------------------------===//
47 void AMDGPUTargetLowering::InitAMDILLowering() {
// Registers how individual (opcode, value type) pairs are handled -- Custom,
// Expand or Legal -- through setOperationAction, and then sets the scheduling,
// control-flow and memory-intrinsic tuning values at the end of the function.
//
// Element counts of the type tables walked by the loops below.
91 size_t NumTypes = sizeof(types) / sizeof(*types);
92 size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
93 size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
94 size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);
// Subtarget handle; its device() is queried further down for LongOps and
// DoubleOps support.
96 const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
97 // These are the current register classes that are
100 for (unsigned int x = 0; x < NumTypes; ++x) {
101 MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
103 //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types
104 // We cannot sextinreg, expand to shifts
105 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
106 setOperationAction(ISD::SUBE, VT, Expand);
107 setOperationAction(ISD::SUBC, VT, Expand);
108 setOperationAction(ISD::ADDE, VT, Expand);
109 setOperationAction(ISD::ADDC, VT, Expand);
110 setOperationAction(ISD::BRCOND, VT, Custom);
111 setOperationAction(ISD::BR_JT, VT, Expand);
112 setOperationAction(ISD::BRIND, VT, Expand);
113 // TODO: Implement custom UREM/SREM routines
114 setOperationAction(ISD::SREM, VT, Expand);
115 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
116 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
// SDIV is custom lowered for every type except i64 and v2i64.
117 if (VT != MVT::i64 && VT != MVT::v2i64) {
118 setOperationAction(ISD::SDIV, VT, Custom);
// Actions applied to every entry of FloatTypes.
121 for (unsigned int x = 0; x < NumFloatTypes; ++x) {
122 MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
124 // IL does not have these operations for floating point types
125 setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
126 setOperationAction(ISD::SETOLT, VT, Expand);
127 setOperationAction(ISD::SETOGE, VT, Expand);
128 setOperationAction(ISD::SETOGT, VT, Expand);
129 setOperationAction(ISD::SETOLE, VT, Expand);
130 setOperationAction(ISD::SETULT, VT, Expand);
131 setOperationAction(ISD::SETUGE, VT, Expand);
132 setOperationAction(ISD::SETUGT, VT, Expand);
133 setOperationAction(ISD::SETULE, VT, Expand);
// Actions applied to every entry of IntTypes.
136 for (unsigned int x = 0; x < NumIntTypes; ++x) {
137 MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
139 // GPU also does not have divrem function for signed or unsigned
140 setOperationAction(ISD::SDIVREM, VT, Expand);
142 // GPU does not have [S|U]MUL_LOHI functions as a single instruction
143 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
144 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
146 // GPU doesn't have a rotl, rotr, or byteswap instruction
147 setOperationAction(ISD::ROTR, VT, Expand);
148 setOperationAction(ISD::BSWAP, VT, Expand);
150 // GPU doesn't have any counting operators
151 setOperationAction(ISD::CTPOP, VT, Expand);
152 setOperationAction(ISD::CTTZ, VT, Expand);
153 setOperationAction(ISD::CTLZ, VT, Expand);
// Actions applied to every entry of VectorTypes.
156 for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) {
157 MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];
159 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
160 setOperationAction(ISD::SDIVREM, VT, Expand);
161 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
162 // setOperationAction(ISD::VSETCC, VT, Expand);
163 setOperationAction(ISD::SELECT_CC, VT, Expand);
// i64 / v2i64 actions, guarded by the device reporting LongOps support.
166 if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) {
167 setOperationAction(ISD::MULHU, MVT::i64, Expand);
168 setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
169 setOperationAction(ISD::MULHS, MVT::i64, Expand);
170 setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
171 setOperationAction(ISD::ADD, MVT::v2i64, Expand);
172 setOperationAction(ISD::SREM, MVT::v2i64, Expand);
173 setOperationAction(ISD::Constant , MVT::i64 , Legal);
174 setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
175 setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
176 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
177 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
178 setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
// f64 / v2f64 actions, guarded by the device reporting DoubleOps support.
180 if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) {
181 // we support loading/storing v2f64 but not operations on the type
182 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
183 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
184 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
185 setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
186 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
187 setOperationAction(ISD::ConstantFP , MVT::f64 , Legal);
188 // We want to expand vector conversions into their scalar
190 setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
191 setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
192 setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
193 setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
194 setOperationAction(ISD::FABS, MVT::f64, Expand);
195 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
197 // TODO: Fix the UDIV24 algorithm so it works for these
198 // types correctly. This needs vector comparisons
199 // for this to work correctly.
200 setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
201 setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
202 setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
203 setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
// Actions keyed on MVT::i1 and MVT::Other rather than on a table entry.
204 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
205 setOperationAction(ISD::SUBC, MVT::Other, Expand);
206 setOperationAction(ISD::ADDE, MVT::Other, Expand);
207 setOperationAction(ISD::ADDC, MVT::Other, Expand);
208 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
209 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
210 setOperationAction(ISD::BRIND, MVT::Other, Expand);
211 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
214 // Use the default implementation.
215 setOperationAction(ISD::ConstantFP , MVT::f32 , Legal);
216 setOperationAction(ISD::Constant , MVT::i32 , Legal);
// Scheduling preference plus division, loop-alignment, select and jump cost
// settings.
218 setSchedulingPreference(Sched::RegPressure);
219 setPow2DivIsCheap(false);
220 setPrefLoopAlignment(16);
221 setSelectIsExpensive(true);
222 setJumpIsExpensive(true);
// Set the store limits for memcpy, memmove and memset to 4096.
224 maxStoresPerMemcpy = 4096;
225 maxStoresPerMemmove = 4096;
226 maxStoresPerMemset = 4096;
231 AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
232 const CallInst &I, unsigned Intrinsic) const {
236 // The backend supports 32-bit and 64-bit floating point immediates.
238 AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// The test below matches when the scalar type of VT is f32 or f64; for a
// vector VT that is its element type.
239 if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
240 || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
248 AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
// Same type test as isFPImmLegal: matches when the scalar type of VT is f32
// or f64.
249 if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
250 || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
258 // computeMaskedBitsForTargetNode - Compute which bits of Op are known to be
259 // zero or one. Op is expected to be a target specific node. Used by DAG
263 AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
267 const SelectionDAG &DAG,
268 unsigned Depth) const {
// Both masks are cleared first, so opcodes the switch does not handle report
// no known bits.
271 KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything
272 switch (Op.getOpcode()) {
275 DAG.ComputeMaskedBits(
281 DAG.ComputeMaskedBits(
// Sanity checks: no bit may be reported as both known-zero and known-one.
286 assert((KnownZero & KnownOne) == 0
287 && "Bits known to be one AND zero?");
288 assert((KnownZero2 & KnownOne2) == 0
289 && "Bits known to be one AND zero?");
290 // Only known if known in both the LHS and RHS
291 KnownOne &= KnownOne2;
292 KnownZero &= KnownZero2;
297 //===----------------------------------------------------------------------===//
298 // Other Lowering Hooks
299 //===----------------------------------------------------------------------===//
302 AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
// Picks a signed-division lowering routine from the scalar element type of
// the result: i64 -> LowerSDIV64, i32 -> LowerSDIV32, i16 or i8 -> LowerSDIV24.
303 EVT OVT = Op.getValueType();
305 if (OVT.getScalarType() == MVT::i64) {
306 DST = LowerSDIV64(Op, DAG);
307 } else if (OVT.getScalarType() == MVT::i32) {
308 DST = LowerSDIV32(Op, DAG);
309 } else if (OVT.getScalarType() == MVT::i16
310 || OVT.getScalarType() == MVT::i8) {
311 DST = LowerSDIV24(Op, DAG);
// Here DST simply refers to result 0 of the incoming node.
313 DST = SDValue(Op.getNode(), 0);
319 AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
// Picks a signed-remainder lowering routine from the scalar element type of
// the result: i64 -> LowerSREM64, i32 -> LowerSREM32, i16 -> LowerSREM16,
// i8 -> LowerSREM8.
320 EVT OVT = Op.getValueType();
322 if (OVT.getScalarType() == MVT::i64) {
323 DST = LowerSREM64(Op, DAG);
324 } else if (OVT.getScalarType() == MVT::i32) {
325 DST = LowerSREM32(Op, DAG);
326 } else if (OVT.getScalarType() == MVT::i16) {
327 DST = LowerSREM16(Op, DAG);
328 } else if (OVT.getScalarType() == MVT::i8) {
329 DST = LowerSREM8(Op, DAG);
// Here DST simply refers to result 0 of the incoming node.
331 DST = SDValue(Op.getNode(), 0);
337 AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
// Expands SIGN_EXTEND_INREG into a left shift followed by an arithmetic right
// shift by the same amount, then converts the result back to the type of
// operand 0. Operand 0 is the data; operand 1 is a VTSDNode carrying the type
// being extended from.
338 SDValue Data = Op.getOperand(0);
339 VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
340 DebugLoc DL = Op.getDebugLoc();
341 EVT DVT = Data.getValueType();
342 EVT BVT = BaseType->getVT();
// shiftBits starts as the difference between the data's scalar width (taken
// as 1 when DVT is not a simple type) and the width being extended from.
343 unsigned baseBits = BVT.getScalarType().getSizeInBits();
344 unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
345 unsigned shiftBits = srcBits - baseBits;
347 // If the op is less than 32 bits, then it needs to extend to 32 bits
348 // so it can properly keep the upper bits valid.
349 EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
350 Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
351 shiftBits = 32 - baseBits;
// NOTE(review): Data is re-typed to IVT by the ZERO_EXTEND above, but the
// shift amount and both shift nodes below are still created with DVT --
// confirm the types agree when the widening path is taken.
354 SDValue Shift = DAG.getConstant(shiftBits, DVT);
355 // Shift left by 'Shift' bits.
356 Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
357 // Signed shift Right by 'Shift' bits.
358 Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
360 // Once the sign extension is done, the op needs to be converted to
361 // its original type.
362 Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
367 AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
// Produces an integer EVT -- i32, i64, or a vector of either -- from an element
// width in bits (size) and an element count (numEle).
// iSize is the total number of bits requested.
368 int iSize = (size * numEle);
// vEle is iSize divided by 64 when size is 64, and by 32 otherwise; it is the
// element count used for the vector results below.
369 int vEle = (iSize >> ((size == 64) ? 6 : 5));
375 return EVT(MVT::i64);
377 return EVT(MVT::getVectorVT(MVT::i64, vEle));
381 return EVT(MVT::i32);
383 return EVT(MVT::getVectorVT(MVT::i32, vEle));
389 AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
// Custom lowering for BRCOND: reads operands 0, 1 and 2 of the node as the
// chain, the condition and the jump, and builds an AMDGPUISD::BRANCH_COND node.
390 SDValue Chain = Op.getOperand(0);
391 SDValue Cond = Op.getOperand(1);
392 SDValue Jump = Op.getOperand(2);
394 Result = DAG.getNode(
395 AMDGPUISD::BRANCH_COND,
403 AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
// Builds a signed division out of floating point nodes: both operands are
// sign-extended or truncated to INTTY, converted to FLTTY, divided with
// AMDGPUISD::DIV_INF and truncated with FTRUNC. The quotient converted back
// to integer is then corrected by adding jq, which is selected between a
// sign-derived value and 0 by comparing |fr| against |fb|.
404 DebugLoc DL = Op.getDebugLoc();
405 EVT OVT = Op.getValueType();
406 SDValue LHS = Op.getOperand(0);
407 SDValue RHS = Op.getOperand(1);
410 if (!OVT.isVector()) {
413 } else if (OVT.getVectorNumElements() == 2) {
416 } else if (OVT.getVectorNumElements() == 4) {
420 unsigned bitsize = OVT.getScalarType().getSizeInBits();
421 // char|short jq = ia ^ ib;
422 SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
424 // jq = jq >> (bitsize - 2)
425 jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
428 jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
431 jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
433 // int ia = (int)LHS;
434 SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
437 SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
439 // float fa = (float)ia;
440 SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
442 // float fb = (float)ib;
443 SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
445 // float fq = native_divide(fa, fb);
446 SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb);
449 fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
451 // float fqneg = -fq;
452 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
454 // float fr = mad(fqneg, fb, fa);
455 SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa);
458 SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
461 fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
464 fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
466 // int cv = fr >= fb;
// NOTE(review): the two getSetCC calls below are identical, so the
// INTTY == MVT::i32 test appears to make no difference -- confirm.
468 if (INTTY == MVT::i32) {
469 cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
471 cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
473 // jq = (cv ? jq : 0);
474 jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
475 DAG.getConstant(0, OVT));
477 iq = DAG.getSExtOrTrunc(iq, DL, OVT);
478 iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
483 AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
// Signed division built on ISD::UDIV. r10 and r11 are masks, each -1 or 0,
// selected by comparing r0 and r1 against 0. The add-then-xor pair
// (x + m) ^ m negates x when m is -1 and leaves it unchanged when m is 0; it
// is applied to both operands before the UDIV and, with m = r10 ^ r11, to the
// quotient afterwards.
484 DebugLoc DL = Op.getDebugLoc();
485 EVT OVT = Op.getValueType();
486 SDValue LHS = Op.getOperand(0);
487 SDValue RHS = Op.getOperand(1);
488 // The LowerSDIV32 function generates the equivalent of the following IL.
498 // ixor r10, r10, r11
509 SDValue r10 = DAG.getSelectCC(DL,
510 r0, DAG.getConstant(0, OVT),
511 DAG.getConstant(-1, MVT::i32),
512 DAG.getConstant(0, MVT::i32),
516 SDValue r11 = DAG.getSelectCC(DL,
517 r1, DAG.getConstant(0, OVT),
518 DAG.getConstant(-1, MVT::i32),
519 DAG.getConstant(0, MVT::i32),
523 r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
526 r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
529 r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
532 r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
535 r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
537 // ixor r10, r10, r11
538 r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
541 r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
544 SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
549 AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
// No expansion is built here: result 0 of the incoming node is returned as is.
550 return SDValue(Op.getNode(), 0);
554 AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
// Performs the remainder in a wider integer type: both operands are converted
// to INTTY with getSExtOrTrunc, an ISD::SREM node is built in INTTY, and the
// result is converted back to the original type OVT.
555 DebugLoc DL = Op.getDebugLoc();
556 EVT OVT = Op.getValueType();
// INTTY starts as i32; the v2i8 and v4i8 cases are tested separately below.
557 MVT INTTY = MVT::i32;
558 if (OVT == MVT::v2i8) {
560 } else if (OVT == MVT::v4i8) {
563 SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
564 SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
565 LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
566 LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
571 AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
// Performs the remainder in a wider integer type: both operands are converted
// to INTTY with getSExtOrTrunc, an ISD::SREM node is built in INTTY, and the
// result is converted back to the original type OVT.
572 DebugLoc DL = Op.getDebugLoc();
573 EVT OVT = Op.getValueType();
// INTTY starts as i32; the v2i16 and v4i16 cases are tested separately below.
574 MVT INTTY = MVT::i32;
575 if (OVT == MVT::v2i16) {
577 } else if (OVT == MVT::v4i16) {
580 SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
581 SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
582 LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
583 LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
588 AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
// Signed remainder assembled from unsigned nodes. r10 and r11 hold the results
// of the comparisons (r0 < 0) and (r1 < 0); they drive the add/xor adjustment
// of both operands before the unsigned step and of the result afterwards.
589 DebugLoc DL = Op.getDebugLoc();
590 EVT OVT = Op.getValueType();
591 SDValue LHS = Op.getOperand(0);
592 SDValue RHS = Op.getOperand(1);
593 // The LowerSREM32 function generates the equivalent of the following IL.
615 SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
618 SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
621 r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
624 r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
627 r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
630 r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
// NOTE(review): r20 is produced by ISD::UREM and is then multiplied by r1 and
// subtracted from r0. If the intent is r0 - (r0 / r1) * r1, this first node
// would need to be a UDIV -- confirm.
633 SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1);
636 r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
639 r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
642 r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
645 SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
650 AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
// No expansion is built here: result 0 of the incoming node is returned as is.
651 return SDValue(Op.getNode(), 0);