set(LLVM_ALL_TARGETS
AArch64
+ AMDGPU
ARM
BPF
CppBackend
MSP430
NVPTX
PowerPC
- R600
Sparc
SystemZ
X86
fi
dnl List all possible targets
-ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600 BPF"
+ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ AMDGPU BPF"
AC_SUBST(ALL_TARGETS,$ALL_TARGETS)
dnl Allow specific targets to be specified for building (or not)
hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
systemz) TARGETS_TO_BUILD="SystemZ $TARGETS_TO_BUILD" ;;
- r600) TARGETS_TO_BUILD="R600 $TARGETS_TO_BUILD" ;;
+ amdgpu | r600) TARGETS_TO_BUILD="AMDGPU $TARGETS_TO_BUILD" ;;
host) case "$llvm_cv_target_arch" in
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
fi
-ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600 BPF"
+ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ AMDGPU BPF"
ALL_TARGETS=$ALL_TARGETS
hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
systemz) TARGETS_TO_BUILD="SystemZ $TARGETS_TO_BUILD" ;;
- r600) TARGETS_TO_BUILD="R600 $TARGETS_TO_BUILD" ;;
+ amdgpu | r600) TARGETS_TO_BUILD="AMDGPU $TARGETS_TO_BUILD" ;;
host) case "$llvm_cv_target_arch" in
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
--- /dev/null
+==============================
+User Guide for AMDGPU Back-end
+==============================
+
+Introduction
+============
+
+The AMDGPU back-end provides ISA code generation for AMD GPUs, starting with
+the R600 family up until the current Volcanic Islands (GCN Gen 3).
+
+
+Assembler
+=========
+
+The assembler is currently considered experimental.
+
+For syntax examples look in test/MC/AMDGPU.
+
+Below are some of the currently supported features (modulo bugs). These
+all apply to the Southern Islands ISA; Sea Islands and Volcanic Islands
+are also supported, but may be missing some instructions and have more bugs:
+
+DS Instructions
+---------------
+All DS instructions are supported.
+
+FLAT Instructions
+------------------
+These instructions are only present in the Sea Islands and Volcanic Islands
+instruction sets. All FLAT instructions are supported for these architectures.
+
+MUBUF Instructions
+------------------
+All non-atomic MUBUF instructions are supported.
+
+SMRD Instructions
+-----------------
+Only the s_load_dword* SMRD instructions are supported.
+
+SOP1 Instructions
+-----------------
+All SOP1 instructions are supported.
+
+SOP2 Instructions
+-----------------
+All SOP2 instructions are supported.
+
+SOPC Instructions
+-----------------
+All SOPC instructions are supported.
+
+SOPP Instructions
+-----------------
+
+Unless otherwise mentioned, all SOPP instructions that have one or more
+operands accept integer operands only. No verification is performed
+on the operands, so it is up to the programmer to be familiar with the
+range of acceptable values.
+
+s_waitcnt
+^^^^^^^^^
+
+s_waitcnt accepts named arguments to specify which memory counter(s) to
+wait for.
+
+.. code-block:: nasm
+
+ // Wait for all counters to be 0
+ s_waitcnt 0
+
+ // Equivalent to s_waitcnt 0. Counter names can also be delimited by
+ // '&' or ','.
+ s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+
+ // Wait for vmcnt counter to be 1.
+ s_waitcnt vmcnt(1)
+
+VOP1, VOP2, VOP3, VOPC Instructions
+-----------------------------------
+
+All 32-bit and 64-bit encodings should work.
+
+The assembler will automatically detect which encoding size to use for
+VOP1, VOP2, and VOPC instructions based on the operands. If you want to force
+a specific encoding size, you can add an _e32 (for 32-bit encoding) or
+_e64 (for 64-bit encoding) suffix to the instruction. Most, but not all,
+instructions support an explicit suffix. These are all valid assembly
+strings:
+
+.. code-block:: nasm
+
+ v_mul_i32_i24 v1, v2, v3
+ v_mul_i32_i24_e32 v1, v2, v3
+ v_mul_i32_i24_e64 v1, v2, v3
* `PowerPC64 alignment of long doubles (from GCC) <http://gcc.gnu.org/ml/gcc-patches/2003-09/msg00997.html>`_
* `Long branch stubs for powerpc64-linux (from binutils) <http://sources.redhat.com/ml/binutils/2002-04/msg00573.html>`_
-R600
-----
+AMDGPU
+------
* `AMD R6xx shader ISA <http://developer.amd.com/wordpress/media/2012/10/R600_Instruction_Set_Architecture.pdf>`_
* `AMD R7xx shader ISA <http://developer.amd.com/wordpress/media/2012/10/R700-Family_Instruction_Set_Architecture.pdf>`_
| | as ``LLVM_ALL_TARGETS``, and can be set to include |
| | out-of-tree targets. The default value includes: |
| | ``AArch64, ARM, CppBackend, Hexagon, |
-| | Mips, MSP430, NVPTX, PowerPC, R600, Sparc, |
+| | Mips, MSP430, NVPTX, PowerPC, AMDGPU, Sparc, |
| | SystemZ, X86, XCore``. |
+-------------------------+----------------------------------------------------+
| LLVM_ENABLE_DOXYGEN | Build doxygen-based documentation from the source |
+++ /dev/null
-============================
-User Guide for R600 Back-end
-============================
-
-Introduction
-============
-
-The R600 back-end provides ISA code generation for AMD GPUs, starting with
-the R600 family up until the current Volcanic Islands (GCN Gen 3).
-
-
-Assembler
-=========
-
-The assembler is currently considered experimental.
-
-For syntax examples look in test/MC/R600.
-
-Below some of the currently supported features (modulo bugs). These
-all apply to the Southern Islands ISA, Sea Islands and Volcanic Islands
-are also supported but may be missing some instructions and have more bugs:
-
-DS Instructions
----------------
-All DS instructions are supported.
-
-FLAT Instructions
-------------------
-These instructions are only present in the Sea Islands and Volcanic Islands
-instruction set. All FLAT instructions are supported for these architectures
-
-MUBUF Instructions
-------------------
-All non-atomic MUBUF instructions are supported.
-
-SMRD Instructions
------------------
-Only the s_load_dword* SMRD instructions are supported.
-
-SOP1 Instructions
------------------
-All SOP1 instructions are supported.
-
-SOP2 Instructions
------------------
-All SOP2 instructions are supported.
-
-SOPC Instructions
------------------
-All SOPC instructions are supported.
-
-SOPP Instructions
------------------
-
-Unless otherwise mentioned, all SOPP instructions that have one or more
-operands accept integer operands only. No verification is performed
-on the operands, so it is up to the programmer to be familiar with the
-range or acceptable values.
-
-s_waitcnt
-^^^^^^^^^
-
-s_waitcnt accepts named arguments to specify which memory counter(s) to
-wait for.
-
-.. code-block:: nasm
-
- // Wait for all counters to be 0
- s_waitcnt 0
-
- // Equivalent to s_waitcnt 0. Counter names can also be delimited by
- // '&' or ','.
- s_waitcnt vmcnt(0) expcnt(0) lgkcmt(0)
-
- // Wait for vmcnt counter to be 1.
- s_waitcnt vmcnt(1)
-
-VOP1, VOP2, VOP3, VOPC Instructions
------------------------------------
-
-All 32-bit and 64-bit encodings should work.
-
-The assembler will automatically detect which encoding size to use for
-VOP1, VOP2, and VOPC instructions based on the operands. If you want to force
-a specific encoding size, you can add an _e32 (for 32-bit encoding) or
-_e64 (for 64-bit encoding) suffix to the instruction. Most, but not all
-instructions support an explicit suffix. These are all valid assembly
-strings:
-
-.. code-block:: nasm
-
- v_mul_i32_i24 v1, v2, v3
- v_mul_i32_i24_e32 v1, v2, v3
- v_mul_i32_i24_e64 v1, v2, v3
WritingAnLLVMPass
HowToUseAttributes
NVPTXUsage
- R600Usage
+ AMDGPUUsage
StackMaps
InAlloca
BigEndianNEON
:doc:`NVPTXUsage`
This document describes using the NVPTX back-end to compile GPU kernels.
-:doc:`R600Usage`
- This document describes how to use the R600 back-end.
+:doc:`AMDGPUUsage`
+ This document describes how to use the AMDGPU back-end.
:doc:`StackMaps`
LLVM support for mapping instruction addresses to the location of
--- /dev/null
+//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H
+#define LLVM_LIB_TARGET_R600_AMDGPU_H
+
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AMDGPUInstPrinter;
+class AMDGPUSubtarget;
+class AMDGPUTargetMachine;
+class FunctionPass;
+class MCAsmInfo;
+class raw_ostream;
+class Target;
+class TargetMachine;
+
+// R600 Passes
+FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
+FunctionPass *createR600TextureIntrinsicsReplacer();
+FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
+FunctionPass *createR600EmitClauseMarkers();
+FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
+FunctionPass *createR600Packetizer(TargetMachine &tm);
+FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+FunctionPass *createAMDGPUCFGStructurizerPass();
+
+// SI Passes
+FunctionPass *createSITypeRewriter();
+FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSIFoldOperandsPass();
+FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIShrinkInstructionsPass();
+FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
+FunctionPass *createSIFixControlFlowLiveIntervalsPass();
+FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
+FunctionPass *createSIFixSGPRLiveRangesPass();
+FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
+FunctionPass *createSIInsertWaits(TargetMachine &tm);
+FunctionPass *createSIPrepareScratchRegs();
+
+void initializeSIFoldOperandsPass(PassRegistry &);
+extern char &SIFoldOperandsID;
+
+void initializeSILowerI1CopiesPass(PassRegistry &);
+extern char &SILowerI1CopiesID;
+
+void initializeSILoadStoreOptimizerPass(PassRegistry &);
+extern char &SILoadStoreOptimizerID;
+
+// Passes common to R600 and SI
+FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
+Pass *createAMDGPUStructurizeCFGPass();
+FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
+ModulePass *createAMDGPUAlwaysInlinePass();
+
+void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
+extern char &SIFixControlFlowLiveIntervalsID;
+
+void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
+extern char &SIFixSGPRLiveRangesID;
+
+
+extern Target TheAMDGPUTarget;
+extern Target TheGCNTarget;
+
+namespace AMDGPU {
+enum TargetIndex {
+ TI_CONSTDATA_START,
+ TI_SCRATCH_RSRC_DWORD0,
+ TI_SCRATCH_RSRC_DWORD1,
+ TI_SCRATCH_RSRC_DWORD2,
+ TI_SCRATCH_RSRC_DWORD3
+};
+}
+
+#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel"
+
+} // End namespace llvm
+
+namespace ShaderType {
+ enum Type {
+ PIXEL = 0,
+ VERTEX = 1,
+ GEOMETRY = 2,
+ COMPUTE = 3
+ };
+}
+
+/// OpenCL uses address spaces to differentiate between
+/// various memory regions on the hardware. On the CPU
+/// all of the address spaces point to the same memory,
+/// however on the GPU, each address space points to
+/// a separate piece of memory that is unique from other
+/// memory locations.
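+/// As an illustration, OpenCL __private, __global, __constant and __local
+/// pointers map to PRIVATE_ADDRESS (0), GLOBAL_ADDRESS (1),
+/// CONSTANT_ADDRESS (2) and LOCAL_ADDRESS (3) respectively in this backend.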
+namespace AMDGPUAS {
+enum AddressSpaces : unsigned {
+ PRIVATE_ADDRESS = 0, ///< Address space for private memory.
+ GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
+ CONSTANT_ADDRESS = 2, ///< Address space for constant memory
+ LOCAL_ADDRESS = 3, ///< Address space for local memory.
+ FLAT_ADDRESS = 4, ///< Address space for flat memory.
+ REGION_ADDRESS = 5, ///< Address space for region memory.
+ PARAM_D_ADDRESS = 6, ///< Address space for directly addressable parameter memory (CONST0)
+ PARAM_I_ADDRESS = 7, ///< Address space for indirectly addressable parameter memory (VTX1)
+
+ // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this
+ // order to be able to dynamically index a constant buffer, for example:
+ //
+ // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
+
+ CONSTANT_BUFFER_0 = 8,
+ CONSTANT_BUFFER_1 = 9,
+ CONSTANT_BUFFER_2 = 10,
+ CONSTANT_BUFFER_3 = 11,
+ CONSTANT_BUFFER_4 = 12,
+ CONSTANT_BUFFER_5 = 13,
+ CONSTANT_BUFFER_6 = 14,
+ CONSTANT_BUFFER_7 = 15,
+ CONSTANT_BUFFER_8 = 16,
+ CONSTANT_BUFFER_9 = 17,
+ CONSTANT_BUFFER_10 = 18,
+ CONSTANT_BUFFER_11 = 19,
+ CONSTANT_BUFFER_12 = 20,
+ CONSTANT_BUFFER_13 = 21,
+ CONSTANT_BUFFER_14 = 22,
+ CONSTANT_BUFFER_15 = 23,
+ ADDRESS_NONE = 24, ///< Address space for unknown memory.
+ LAST_ADDRESS = ADDRESS_NONE,
+
+ // Some places use this if the address space can't be determined.
+ UNKNOWN_ADDRESS_SPACE = ~0u
+};
+
+} // namespace AMDGPUAS
+
+#endif
--- /dev/null
+//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features
+//===----------------------------------------------------------------------===//
+
+// Debugging Features
+
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+ "DumpCode",
+ "true",
+ "Dump MachineInstrs in the CodeEmitter">;
+
+def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
+ "DumpCode",
+ "true",
+ "Dump MachineInstrs in the CodeEmitter">;
+
+def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer",
+ "EnableIRStructurizer",
+ "false",
+ "Disable IR Structurizer">;
+
+def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
+ "EnablePromoteAlloca",
+ "true",
+ "Enable promote alloca pass">;
+
+// Target features
+
+def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
+ "EnableIfCvt",
+ "false",
+ "Disable the if conversion pass">;
+
+def FeatureFP64 : SubtargetFeature<"fp64",
+ "FP64",
+ "true",
+ "Enable double precision operations">;
+
+def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
+ "FP64Denormals",
+ "true",
+ "Enable double precision denormal handling",
+ [FeatureFP64]>;
+
+def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
+ "FastFMAF32",
+ "true",
+ "Assuming f32 fma is at least as fast as mul + add",
+ []>;
+
+// Some instructions do not support denormals despite this flag. Using
+// fp32 denormals also causes instructions to run at the double
+// precision rate for the device.
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
+ "FP32Denormals",
+ "true",
+ "Enable single precision denormal handling">;
+
+def Feature64BitPtr : SubtargetFeature<"64BitPtr",
+ "Is64bit",
+ "true",
+ "Specify if 64-bit addressing should be used">;
+
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
+ "R600ALUInst",
+ "false",
+ "Older version of ALU instructions encoding">;
+
+def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
+ "HasVertexCache",
+ "true",
+ "Specify use of dedicated vertex cache">;
+
+def FeatureCaymanISA : SubtargetFeature<"caymanISA",
+ "CaymanISA",
+ "true",
+ "Use Cayman ISA">;
+
+def FeatureCFALUBug : SubtargetFeature<"cfalubug",
+ "CFALUBug",
+ "true",
+ "GPU has CF_ALU bug">;
+
+// XXX - This should probably be removed once enabled by default
+def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
+ "EnableLoadStoreOpt",
+ "true",
+ "Enable SI load/store optimizer pass">;
+
+def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
+ "FlatAddressSpace",
+ "true",
+ "Support flat address space">;
+
+def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
+ "EnableVGPRSpilling",
+ "true",
+ "Enable spilling of VGPRs to scratch memory">;
+
+def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
+ "SGPRInitBug",
+ "true",
+ "VI SGPR initilization bug requiring a fixed SGPR allocation size">;
+
+class SubtargetFeatureFetchLimit <string Value> :
+ SubtargetFeature <"fetch"#Value,
+ "TexVTXClauseSize",
+ Value,
+ "Limit the maximum number of fetches in a clause to "#Value>;
+
+def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
+def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+
+class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
+ "wavefrontsize"#Value,
+ "WavefrontSize",
+ !cast<string>(Value),
+ "The number of threads per wavefront">;
+
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+
+class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
+ "ldsbankcount"#Value,
+ "LDSBankCount",
+ !cast<string>(Value),
+ "The number of LDS banks per compute unit.">;
+
+def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
+def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
+
+class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
+ "localmemorysize"#Value,
+ "LocalMemorySize",
+ !cast<string>(Value),
+ "The size of local memory in bytes">;
+
+def FeatureGCN : SubtargetFeature<"gcn",
+ "IsGCN",
+ "true",
+ "GCN or newer GPU">;
+
+def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding",
+ "GCN1Encoding",
+ "true",
+ "Encoding format for SI and CI">;
+
+def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
+ "GCN3Encoding",
+ "true",
+ "Encoding format for VI">;
+
+def FeatureCIInsts : SubtargetFeature<"ci-insts",
+ "CIInsts",
+ "true",
+ "Additional intstructions for CI+">;
+
+// Dummy feature used to disable assembler instructions.
+def FeatureDisable : SubtargetFeature<"",
+ "FeatureDisable","true",
+ "Dummy feature to disable assembler"
+ " instructions">;
+
+class SubtargetFeatureGeneration <string Value,
+ list<SubtargetFeature> Implies> :
+ SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
+ Value#" GPU generation", Implies>;
+
+def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
+def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
+def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
+
+def FeatureR600 : SubtargetFeatureGeneration<"R600",
+ [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
+
+def FeatureR700 : SubtargetFeatureGeneration<"R700",
+ [FeatureFetchLimit16, FeatureLocalMemorySize0]>;
+
+def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
+ [FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
+
+def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+ [FeatureFetchLimit16, FeatureWavefrontSize64,
+ FeatureLocalMemorySize32768]
+>;
+
+def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+ [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768,
+ FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
+ FeatureLDSBankCount32]>;
+
+def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
+ [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
+ FeatureGCN1Encoding, FeatureCIInsts]>;
+
+def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+ [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
+ FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>;
+
+//===----------------------------------------------------------------------===//
+
+def AMDGPUInstrInfo : InstrInfo {
+ let guessInstructionProperties = 1;
+ let noNamedPositionallyEncodedOperands = 1;
+}
+
+def AMDGPUAsmParser : AsmParser {
+ // Some of the R600 registers have the same name, so this crashes.
+ // For example T0_XYZW and T0_XY both have the asm name T0.
+ let ShouldEmitMatchRegisterName = 0;
+}
+
+def AMDGPU : Target {
+ // Pull in Instruction Info:
+ let InstructionSet = AMDGPUInstrInfo;
+ let AssemblyParsers = [AMDGPUAsmParser];
+}
+
+// Dummy Instruction itineraries for pseudo instructions
+def ALU_NULL : FuncUnit;
+def NullALU : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Predicate helper class
+//===----------------------------------------------------------------------===//
+
+def TruePredicate : Predicate<"true">;
+def isSICI : Predicate<
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
+>, AssemblerPredicate<"FeatureGCN1Encoding">;
+
+class PredicateControl {
+ Predicate SubtargetPredicate;
+ Predicate SIAssemblerPredicate = isSICI;
+ list<Predicate> AssemblerPredicates = [];
+ Predicate AssemblerPredicate = TruePredicate;
+ list<Predicate> OtherPredicates = [];
+ list<Predicate> Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate],
+ AssemblerPredicates,
+ OtherPredicates);
+}
+
+// Include AMDGPU TD files
+include "R600Schedule.td"
+include "SISchedule.td"
+include "Processors.td"
+include "AMDGPUInstrInfo.td"
+include "AMDGPUIntrinsics.td"
+include "AMDGPURegisterInfo.td"
+include "AMDGPUInstructions.td"
+include "AMDGPUCallingConv.td"
--- /dev/null
+//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass marks all internal functions as always_inline and creates
+/// duplicates of all other functions and marks the duplicates as always_inline.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAlwaysInline : public ModulePass {
+
+ static char ID;
+
+public:
+ AMDGPUAlwaysInline() : ModulePass(ID) { }
+ bool runOnModule(Module &M) override;
+ const char *getPassName() const override { return "AMDGPU Always Inline Pass"; }
+};
+
+} // End anonymous namespace
+
+char AMDGPUAlwaysInline::ID = 0;
+
+bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+
+ std::vector<Function*> FuncsToClone;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ Function &F = *I;
+ if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
+ !F.hasFnAttribute(Attribute::NoInline))
+ FuncsToClone.push_back(&F);
+ }
+
+ for (Function *F : FuncsToClone) {
+ ValueToValueMapTy VMap;
+ Function *NewFunc = CloneFunction(F, VMap, false);
+ NewFunc->setLinkage(GlobalValue::InternalLinkage);
+ F->getParent()->getFunctionList().push_back(NewFunc);
+ F->replaceAllUsesWith(NewFunc);
+ }
+
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ Function &F = *I;
+ if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {
+ F.addFnAttr(Attribute::AlwaysInline);
+ }
+ }
+ return false;
+}
+
+ModulePass *llvm::createAMDGPUAlwaysInlinePass() {
+ return new AMDGPUAlwaysInline();
+}
--- /dev/null
+//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// The AMDGPUAsmPrinter is used to print both assembly strings and binary
+/// code. When passed an MCAsmStreamer it prints assembly and when passed
+/// an MCObjectStreamer it outputs binary code.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUAsmPrinter.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "AMDGPU.h"
+#include "AMDKernelCodeT.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+// TODO: This should get the default rounding mode from the kernel. We just set
+// the default here, but this could change if the OpenCL rounding mode pragmas
+// are used.
+//
+// The denormal mode here should match what is reported by the OpenCL runtime
+// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
+// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
+//
+// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
+// precision, and leaves single precision to flush all and does not report
+// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
+// CL_FP_DENORM for both.
+//
+// FIXME: It seems some instructions do not support single precision denormals
+// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
+// and sin_f32, cos_f32 on most parts).
+
+// We want to use these instructions, and using fp32 denormals also causes
+// instructions to run at the double precision rate for the device so it's
+// probably best to just report no single precision denormals.
+static uint32_t getFPMode(const MachineFunction &F) {
+ const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>();
+ // TODO: Is there any real use for the flush in only / flush out only modes?
+
+ uint32_t FP32Denormals =
+ ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+ uint32_t FP64Denormals =
+ ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+ return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
+ FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
+ FP_DENORM_MODE_SP(FP32Denormals) |
+ FP_DENORM_MODE_DP(FP64Denormals);
+}
+
+static AsmPrinter *
+createAMDGPUAsmPrinterPass(TargetMachine &tm,
+ std::unique_ptr<MCStreamer> &&Streamer) {
+ return new AMDGPUAsmPrinter(tm, std::move(Streamer));
+}
+
+extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
+ TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
+ TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
+}
+
+AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
+
+void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
+
+ // This label is used to mark the end of the .text section.
+ const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+ OutStreamer->SwitchSection(TLOF.getTextSection());
+ MCSymbol *EndOfTextLabel =
+ OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
+ OutStreamer->EmitLabel(EndOfTextLabel);
+}
+
+bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+
+ // The starting address of all shader programs must be 256 bytes aligned.
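+ // (setAlignment takes the log2 of the alignment, so 8 corresponds to 1 << 8 = 256 bytes.)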
+ MF.setAlignment(8);
+
+ SetupMachineFunction(MF);
+
+ MCContext &Context = getObjFileLowering().getContext();
+ MCSectionELF *ConfigSection =
+ Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(ConfigSection);
+
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ SIProgramInfo KernelInfo;
+ if (STM.isAmdHsaOS()) {
+ getSIProgramInfo(KernelInfo, MF);
+ EmitAmdKernelCodeT(MF, KernelInfo);
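+ // Align the kernel code to the function alignment: 2 << (8 - 1) == 1 << 8 == 256 bytes.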
+ OutStreamer->EmitCodeAlignment(2 << (MF.getAlignment() - 1));
+ } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ getSIProgramInfo(KernelInfo, MF);
+ EmitProgramInfoSI(MF, KernelInfo);
+ } else {
+ EmitProgramInfoR600(MF);
+ }
+
+ DisasmLines.clear();
+ HexLines.clear();
+ DisasmLineMaxLen = 0;
+
+ EmitFunctionBody();
+
+ if (isVerbose()) {
+ MCSectionELF *CommentSection =
+ Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(CommentSection);
+
+ if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ OutStreamer->emitRawComment(" Kernel info:", false);
+ OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
+ false);
+ OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
+ false);
+ OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
+ false);
+ OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
+ false);
+ OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
+ false);
+ OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
+ false);
+ } else {
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+ OutStreamer->emitRawComment(
+ Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
+ }
+ }
+
+ if (STM.dumpCode()) {
+
+ OutStreamer->SwitchSection(
+ Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
+
+ for (size_t i = 0; i < DisasmLines.size(); ++i) {
+ std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+ Comment += " ; " + HexLines[i] + "\n";
+
+ OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
+ OutStreamer->EmitBytes(StringRef(Comment));
+ }
+ }
+
+ return false;
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
+ unsigned MaxGPR = 0;
+ bool killPixel = false;
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const R600RegisterInfo *RI =
+ static_cast<const R600RegisterInfo *>(STM.getRegisterInfo());
+ const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == AMDGPU::KILLGT)
+ killPixel = true;
+ unsigned numOperands = MI.getNumOperands();
+ for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+ const MachineOperand &MO = MI.getOperand(op_idx);
+ if (!MO.isReg())
+ continue;
+ unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
+
+ // Registers with value > 127 aren't GPRs
+ if (HWReg > 127)
+ continue;
+ MaxGPR = std::max(MaxGPR, HWReg);
+ }
+ }
+ }
+
+ unsigned RsrcReg;
+ if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
+ // Evergreen / Northern Islands
+ switch (MFI->getShaderType()) {
+ default: // Fall through
+ case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+ case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+ case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+ case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+ }
+ } else {
+ // R600 / R700
+ switch (MFI->getShaderType()) {
+ default: // Fall through
+ case ShaderType::GEOMETRY: // Fall through
+ case ShaderType::COMPUTE: // Fall through
+ case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
+ case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
+ }
+ }
+
+ OutStreamer->EmitIntValue(RsrcReg, 4);
+ OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+ S_STACK_SIZE(MFI->StackSize), 4);
+ OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
+ OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+
+ if (MFI->getShaderType() == ShaderType::COMPUTE) {
+ OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
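+ // Convert the LDS byte size to dwords: round up to a 4-byte multiple, then >> 2.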
+ OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
+ }
+}
+
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
+ const MachineFunction &MF) const {
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ uint64_t CodeSize = 0;
+ unsigned MaxSGPR = 0;
+ unsigned MaxVGPR = 0;
+ bool VCCUsed = false;
+ bool FlatUsed = false;
+ const SIRegisterInfo *RI =
+ static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ // TODO: CodeSize should account for multiple functions.
+ CodeSize += MI.getDesc().Size;
+
+ unsigned numOperands = MI.getNumOperands();
+ for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+ const MachineOperand &MO = MI.getOperand(op_idx);
+ unsigned width = 0;
+ bool isSGPR = false;
+
+ if (!MO.isReg()) {
+ continue;
+ }
+ unsigned reg = MO.getReg();
+ if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
+ reg == AMDGPU::VCC_HI) {
+ VCCUsed = true;
+ continue;
+ } else if (reg == AMDGPU::FLAT_SCR ||
+ reg == AMDGPU::FLAT_SCR_LO ||
+ reg == AMDGPU::FLAT_SCR_HI) {
+ FlatUsed = true;
+ continue;
+ }
+
+ switch (reg) {
+ default: break;
+ case AMDGPU::SCC:
+ case AMDGPU::EXEC:
+ case AMDGPU::M0:
+ continue;
+ }
+
+ if (AMDGPU::SReg_32RegClass.contains(reg)) {
+ isSGPR = true;
+ width = 1;
+ } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 1;
+ } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
+ isSGPR = true;
+ width = 2;
+ } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 2;
+ } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 3;
+ } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
+ isSGPR = true;
+ width = 4;
+ } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 4;
+ } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
+ isSGPR = true;
+ width = 8;
+ } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 8;
+ } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
+ isSGPR = true;
+ width = 16;
+ } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 16;
+ } else {
+ llvm_unreachable("Unknown register class");
+ }
+ unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
+ unsigned maxUsed = hwReg + width - 1;
+ if (isSGPR) {
+ MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
+ } else {
+ MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
+ }
+ }
+ }
+ }
+
+ if (VCCUsed)
+ MaxSGPR += 2;
+
+ if (FlatUsed)
+ MaxSGPR += 2;
+
+ // We found the maximum register index. They start at 0, so add one to get the
+ // number of registers.
+ ProgInfo.NumVGPR = MaxVGPR + 1;
+ ProgInfo.NumSGPR = MaxSGPR + 1;
+
+ if (STM.hasSGPRInitBug()) {
+ if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG)
+ llvm_unreachable("Too many SGPRs used with the SGPR init bug");
+
+ ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+ }
+
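+ // The PGM_RSRC1 VGPRS/SGPRS fields encode register usage in allocation blocks
+ // of 4 VGPRs and 8 SGPRs, stored as the block count minus one, hence (N - 1) / granularity.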
+ ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
+ ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
+ // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
+ // register.
+ ProgInfo.FloatMode = getFPMode(MF);
+
+ // XXX: Not quite sure what this does, but sc seems to unset this.
+ ProgInfo.IEEEMode = 0;
+
+ // Do not clamp NAN to 0.
+ ProgInfo.DX10Clamp = 0;
+
+ const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+ ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
+
+ ProgInfo.FlatUsed = FlatUsed;
+ ProgInfo.VCCUsed = VCCUsed;
+ ProgInfo.CodeLen = CodeSize;
+
+ unsigned LDSAlignShift;
+ if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+ // LDS is allocated in 64 dword blocks.
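+ // (64 dwords * 4 bytes = 256 bytes, hence an alignment shift of 8.)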
+ LDSAlignShift = 8;
+ } else {
+ // LDS is allocated in 128 dword blocks.
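+ // (128 dwords * 4 bytes = 512 bytes, hence an alignment shift of 9.)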
+ LDSAlignShift = 9;
+ }
+
+ unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
+ MFI->getMaximumWorkGroupSize(MF);
+
+ ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
+ ProgInfo.LDSBlocks =
+ RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+
+ // Scratch is allocated in 256 dword blocks.
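+ // (256 dwords * 4 bytes = 1024 bytes, hence an alignment shift of 10.)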
+ unsigned ScratchAlignShift = 10;
+ // We need to program the hardware with the amount of scratch memory that
+ // is used by the entire wave. ProgInfo.ScratchSize is the amount of
+ // scratch memory used per thread.
+ ProgInfo.ScratchBlocks =
+ RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
+ 1 << ScratchAlignShift) >> ScratchAlignShift;
+
+ ProgInfo.ComputePGMRSrc1 =
+ S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
+ S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
+ S_00B848_PRIORITY(ProgInfo.Priority) |
+ S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+ S_00B848_PRIV(ProgInfo.Priv) |
+ S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
+ S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
+ S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+
+ ProgInfo.ComputePGMRSrc2 =
+ S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+ S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
+ S_00B84C_TGID_X_EN(1) |
+ S_00B84C_TGID_Y_EN(1) |
+ S_00B84C_TGID_Z_EN(1) |
+ S_00B84C_TG_SIZE_EN(1) |
+ S_00B84C_TIDIG_COMP_CNT(2) |
+ S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+}
+
+static unsigned getRsrcReg(unsigned ShaderType) {
+ switch (ShaderType) {
+ default: // Fall through
+ case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1;
+ case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
+ case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
+ case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+ }
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo) {
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
+
+ if (MFI->getShaderType() == ShaderType::COMPUTE) {
+ OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
+
+ OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
+
+ OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
+ OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
+
+ OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
+ OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+
+ // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
+ // 0" comment but I don't see a corresponding field in the register spec.
+ } else {
+ OutStreamer->EmitIntValue(RsrcReg, 4);
+ OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
+ S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+ if (STM.isVGPRSpillingEnabled(MFI)) {
+ OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+ OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+ }
+ }
+
+ if (MFI->getShaderType() == ShaderType::PIXEL) {
+ OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
+ OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
+ OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+ OutStreamer->EmitIntValue(MFI->PSInputAddr, 4);
+ }
+}
+
+void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo) const {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+ amd_kernel_code_t header;
+
+ memset(&header, 0, sizeof(header));
+
+ header.amd_code_version_major = AMD_CODE_VERSION_MAJOR;
+ header.amd_code_version_minor = AMD_CODE_VERSION_MINOR;
+
+ header.struct_byte_size = sizeof(amd_kernel_code_t);
+
+ header.target_chip = STM.getAmdKernelCodeChipID();
+
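+ // kernel_code_entry_byte_offset is the byte offset from the start of the
+ // amd_kernel_code_t object to the kernel's entry point; here it is the
+ // function alignment, 1 << 8 = 256 bytes.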
+ header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment());
+
+ header.compute_pgm_resource_registers =
+ KernelInfo.ComputePGMRSrc1 |
+ (KernelInfo.ComputePGMRSrc2 << 32);
+
+ // Code Properties:
+ header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
+ AMD_CODE_PROPERTY_IS_PTR64;
+
+ if (KernelInfo.FlatUsed)
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+ if (KernelInfo.ScratchBlocks)
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
+
+ header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+ header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
+
+ // MFI->ABIArgOffset is the number of bytes for the kernel arguments
+ // plus 36. 36 is the number of bytes reserved at the beginning of the
+ // input buffer to store work-group size information.
+ // FIXME: We should be adding the size of the implicit arguments
+ // to this value.
+ header.kernarg_segment_byte_size = MFI->ABIArgOffset;
+
+ header.wavefront_sgpr_count = KernelInfo.NumSGPR;
+ header.workitem_vgpr_count = KernelInfo.NumVGPR;
+
+ // FIXME: What values do I put for these alignments
+ header.kernarg_segment_alignment = 0;
+ header.group_segment_alignment = 0;
+ header.private_segment_alignment = 0;
+
+ header.code_type = 1; // HSA_EXT_CODE_KERNEL
+
+ header.wavefront_size = STM.getWavefrontSize();
+
+ MCSectionELF *VersionSection =
+ OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0);
+ OutStreamer->SwitchSection(VersionSection);
+ OutStreamer->EmitBytes(Twine("HSA Code Unit:" +
+ Twine(header.hsail_version_major) + "." +
+ Twine(header.hsail_version_minor) + ":" +
+ "AMD:" +
+ Twine(header.amd_code_version_major) + "." +
+ Twine(header.amd_code_version_minor) + ":" +
+ "GFX8.1:0").str());
+
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+
+ if (isVerbose()) {
+ OutStreamer->emitRawComment("amd_code_version_major = " +
+ Twine(header.amd_code_version_major), false);
+ OutStreamer->emitRawComment("amd_code_version_minor = " +
+ Twine(header.amd_code_version_minor), false);
+ OutStreamer->emitRawComment("struct_byte_size = " +
+ Twine(header.struct_byte_size), false);
+ OutStreamer->emitRawComment("target_chip = " +
+ Twine(header.target_chip), false);
+ OutStreamer->emitRawComment(" compute_pgm_rsrc1: " +
+ Twine::utohexstr(KernelInfo.ComputePGMRSrc1),
+ false);
+ OutStreamer->emitRawComment(" compute_pgm_rsrc2: " +
+ Twine::utohexstr(KernelInfo.ComputePGMRSrc2),
+ false);
+ OutStreamer->emitRawComment("enable_sgpr_private_segment_buffer = " +
+ Twine((bool)(header.code_properties &
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false);
+ OutStreamer->emitRawComment("enable_sgpr_kernarg_segment_ptr = " +
+ Twine((bool)(header.code_properties &
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false);
+ OutStreamer->emitRawComment("private_element_size = 2 ", false);
+ OutStreamer->emitRawComment("is_ptr64 = " +
+ Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false);
+ OutStreamer->emitRawComment("workitem_private_segment_byte_size = " +
+ Twine(header.workitem_private_segment_byte_size),
+ false);
+ OutStreamer->emitRawComment("workgroup_group_segment_byte_size = " +
+ Twine(header.workgroup_group_segment_byte_size),
+ false);
+ OutStreamer->emitRawComment("gds_segment_byte_size = " +
+ Twine(header.gds_segment_byte_size), false);
+ OutStreamer->emitRawComment("kernarg_segment_byte_size = " +
+ Twine(header.kernarg_segment_byte_size), false);
+ OutStreamer->emitRawComment("wavefront_sgpr_count = " +
+ Twine(header.wavefront_sgpr_count), false);
+ OutStreamer->emitRawComment("workitem_vgpr_count = " +
+ Twine(header.workitem_vgpr_count), false);
+ OutStreamer->emitRawComment("code_type = " + Twine(header.code_type), false);
+ OutStreamer->emitRawComment("wavefront_size = " +
+ Twine((int)header.wavefront_size), false);
+ OutStreamer->emitRawComment("optimization_level = " +
+ Twine(header.optimization_level), false);
+ OutStreamer->emitRawComment("hsail_profile = " +
+ Twine(header.hsail_profile), false);
+ OutStreamer->emitRawComment("hsail_machine_model = " +
+ Twine(header.hsail_machine_model), false);
+ OutStreamer->emitRawComment("hsail_version_major = " +
+ Twine(header.hsail_version_major), false);
+ OutStreamer->emitRawComment("hsail_version_minor = " +
+ Twine(header.hsail_version_minor), false);
+ }
+
+ OutStreamer->EmitBytes(StringRef((char*)&header, sizeof(header)));
+}
+
+bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0)
+ return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'r':
+ break;
+ }
+ }
+
+ AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
+ *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
+ return false;
+}
--- /dev/null
+//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
+#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include <vector>
+
+namespace llvm {
+
+class AMDGPUAsmPrinter : public AsmPrinter {
+private:
+ struct SIProgramInfo {
+ SIProgramInfo() :
+ VGPRBlocks(0),
+ SGPRBlocks(0),
+ Priority(0),
+ FloatMode(0),
+ Priv(0),
+ DX10Clamp(0),
+ DebugMode(0),
+ IEEEMode(0),
+ ScratchSize(0),
+ ComputePGMRSrc1(0),
+ LDSBlocks(0),
+ ScratchBlocks(0),
+ ComputePGMRSrc2(0),
+ NumVGPR(0),
+ NumSGPR(0),
+ LDSSize(0),
+ FlatUsed(false),
+ VCCUsed(false),
+ CodeLen(0) {}
+
+ // Fields set in PGM_RSRC1 pm4 packet.
+ uint32_t VGPRBlocks;
+ uint32_t SGPRBlocks;
+ uint32_t Priority;
+ uint32_t FloatMode;
+ uint32_t Priv;
+ uint32_t DX10Clamp;
+ uint32_t DebugMode;
+ uint32_t IEEEMode;
+ uint32_t ScratchSize;
+
+ uint64_t ComputePGMRSrc1;
+
+ // Fields set in PGM_RSRC2 pm4 packet.
+ uint32_t LDSBlocks;
+ uint32_t ScratchBlocks;
+
+ uint64_t ComputePGMRSrc2;
+
+ uint32_t NumVGPR;
+ uint32_t NumSGPR;
+ uint32_t LDSSize;
+ bool FlatUsed;
+
+ // Bonus information for debugging.
+ bool VCCUsed;
+ uint64_t CodeLen;
+ };
+
+ void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+ void findNumUsedRegistersSI(const MachineFunction &MF,
+ unsigned &NumSGPR,
+ unsigned &NumVGPR) const;
+
+ /// \brief Emit register usage information so that the GPU driver
+ /// can correctly set up the GPU state.
+ void EmitProgramInfoR600(const MachineFunction &MF);
+ void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+ void EmitAmdKernelCodeT(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo) const;
+
+public:
+ explicit AMDGPUAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer);
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "AMDGPU Assembly Printer";
+ }
+
+ /// Implemented in AMDGPUMCInstLower.cpp
+ void EmitInstruction(const MachineInstr *MI) override;
+
+ void EmitEndOfAsmFile(Module &M) override;
+
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) override;
+
+protected:
+ std::vector<std::string> DisasmLines, HexLines;
+ size_t DisasmLineMaxLen;
+};
+
+} // End namespace llvm
+
+#endif
--- /dev/null
+//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the AMD Radeon GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+// Inversion of CCIfInReg
+class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+
+// Calling convention for SI
+def CC_SI : CallingConv<[
+
+ CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21
+ ]>>>,
+
+ CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
+ [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+ [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
+ >>>,
+
+ CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
+ ]>>>,
+
+ CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
+ [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+ [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
+ >>>
+
+]>;
+
+// Calling convention for R600
+def CC_R600 : CallingConv<[
+ CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
+ T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
+ T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
+ T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
+ T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
+ T30_XYZW, T31_XYZW, T32_XYZW
+ ]>>>
+]>;
+
+// Calling convention for compute kernels
+def CC_AMDGPU_Kernel : CallingConv<[
+ CCCustom<"allocateStack">
+]>;
+
+def CC_AMDGPU : CallingConv<[
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() >="
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()"
+ "->getShaderType() == ShaderType::COMPUTE",
+ CCDelegateTo<CC_AMDGPU_Kernel>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() < "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()"
+ "->getShaderType() == ShaderType::COMPUTE",
+ CCDelegateTo<CC_AMDGPU_Kernel>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS",
+ CCDelegateTo<CC_SI>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() < "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS",
+ CCDelegateTo<CC_R600>>
+]>;
--- /dev/null
+//===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe the layout of a stack frame on an AMDIL target machine
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPURegisterInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
+ int LAO, unsigned TransAl)
+ : TargetFrameLowering(D, StackAl, LAO, TransAl) { }
+
+AMDGPUFrameLowering::~AMDGPUFrameLowering() { }
+
+unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
+
+ // XXX: Hardcoding to 1 for now.
+ //
+ // I think the StackWidth should be stored as metadata associated with the
+ // MachineFunction. This metadata can either be added by a frontend, or
+ // calculated by a R600 specific LLVM IR pass.
+ //
+ // The StackWidth determines how stack objects are laid out in memory.
+ // For a vector stack variable, like: int4 stack[2], the data will be stored
+ // in the following ways depending on the StackWidth.
+ //
+ // StackWidth = 1:
+ //
+ // T0.X = stack[0].x
+ // T1.X = stack[0].y
+ // T2.X = stack[0].z
+ // T3.X = stack[0].w
+ // T4.X = stack[1].x
+ // T5.X = stack[1].y
+ // T6.X = stack[1].z
+ // T7.X = stack[1].w
+ //
+ // StackWidth = 2:
+ //
+ // T0.X = stack[0].x
+ // T0.Y = stack[0].y
+ // T1.X = stack[0].z
+ // T1.Y = stack[0].w
+ // T2.X = stack[1].x
+ // T2.Y = stack[1].y
+ // T3.X = stack[1].z
+ // T3.Y = stack[1].w
+ //
+ // StackWidth = 4:
+ // T0.X = stack[0].x
+ // T0.Y = stack[0].y
+ // T0.Z = stack[0].z
+ // T0.W = stack[0].w
+ // T1.X = stack[1].x
+ // T1.Y = stack[1].y
+ // T1.Z = stack[1].z
+ // T1.W = stack[1].w
+ return 1;
+}
+
+/// \returns The offset of the frame object \p FI, in register-sized units.
+int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+ int FI) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ // Start the offset at 2 so we don't overwrite work group information.
+ // XXX: We should only do this when the shader actually uses this
+ // information.
+ unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
+ int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
+
+ for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i));
+ OffsetBytes += MFI->getObjectSize(i);
+ // Each register holds 4 bytes, so we must always align the offset to at
+ // least 4 bytes, so that 2 frame objects won't share the same register.
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, 4);
+ }
+
+ if (FI != -1)
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI));
+
+ return OffsetBytes / (getStackWidth(MF) * 4);
+}
+
+const TargetFrameLowering::SpillSlot *
+AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+ NumEntries = 0;
+ return nullptr;
+}
+void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {}
+void
+AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+}
+
+bool
+AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const {
+ return false;
+}
--- /dev/null
+//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface to describe the layout of a stack frame on an AMDIL target
+/// machine.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
+#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+/// \brief Information about the stack frame layout on the AMDGPU targets.
+///
+/// It holds the direction of the stack growth, the known stack alignment on
+/// entry to each function, and the offset to the locals area.
+/// See TargetFrameInfo for more comments.
+class AMDGPUFrameLowering : public TargetFrameLowering {
+public:
+ AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
+ unsigned TransAl = 1);
+ virtual ~AMDGPUFrameLowering();
+
+ /// \returns The number of 32-bit sub-registers that are used when storing
+ /// values to the stack.
+ unsigned getStackWidth(const MachineFunction &MF) const;
+ int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+ const SpillSlot *
+ getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ bool hasFP(const MachineFunction &MF) const override;
+};
+} // namespace llvm
+#endif
--- /dev/null
+//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Defines an instruction selector for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUISelLowering.h" // For AMDGPUISD
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "SIDefines.h"
+#include "SIISelLowering.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// AMDGPU-specific code to select AMDGPU machine instructions for
+/// SelectionDAG operations.
+class AMDGPUDAGToDAGISel : public SelectionDAGISel {
+ // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
+ // make the right decision when generating code for different targets.
+ const AMDGPUSubtarget *Subtarget;
+public:
+ AMDGPUDAGToDAGISel(TargetMachine &TM);
+ virtual ~AMDGPUDAGToDAGISel();
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ SDNode *Select(SDNode *N) override;
+ const char *getPassName() const override;
+ void PostprocessISelDAG() override;
+
+private:
+ bool isInlineImmediate(SDNode *N) const;
+ bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
+ const R600InstrInfo *TII);
+ bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+ bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+
+ // Complex pattern selectors
+ bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
+ bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
+ bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
+
+ static bool checkType(const Value *ptr, unsigned int addrspace);
+ static bool checkPrivateAddress(const MachineMemOperand *Op);
+
+ static bool isGlobalStore(const StoreSDNode *N);
+ static bool isFlatStore(const StoreSDNode *N);
+ static bool isPrivateStore(const StoreSDNode *N);
+ static bool isLocalStore(const StoreSDNode *N);
+ static bool isRegionStore(const StoreSDNode *N);
+
+ bool isCPLoad(const LoadSDNode *N) const;
+ bool isConstantLoad(const LoadSDNode *N, int cbID) const;
+ bool isGlobalLoad(const LoadSDNode *N) const;
+ bool isFlatLoad(const LoadSDNode *N) const;
+ bool isParamLoad(const LoadSDNode *N) const;
+ bool isPrivateLoad(const LoadSDNode *N) const;
+ bool isLocalLoad(const LoadSDNode *N) const;
+ bool isRegionLoad(const LoadSDNode *N) const;
+
+ SDNode *glueCopyToM0(SDNode *N) const;
+
+ const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
+ bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
+ bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
+ SDValue& Offset);
+ bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+ bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+ bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+ unsigned OffsetBits) const;
+ bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
+ bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+ SDValue &Offset1) const;
+ void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+ SDValue &SOffset, SDValue &Offset, SDValue &Offen,
+ SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
+ SDValue &TFE) const;
+ bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+ SDValue &SOffset, SDValue &Offset, SDValue &GLC,
+ SDValue &SLC, SDValue &TFE) const;
+ bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+ SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
+ SDValue &SLC) const;
+ bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+ SDValue &SOffset, SDValue &ImmOffset) const;
+ bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
+ SDValue &Offset, SDValue &GLC, SDValue &SLC,
+ SDValue &TFE) const;
+ bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+ SDValue &Offset, SDValue &GLC) const;
+ SDNode *SelectAddrSpaceCast(SDNode *N);
+ bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp, SDValue &Omod) const;
+
+ bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Omod) const;
+ bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp,
+ SDValue &Omod) const;
+
+ SDNode *SelectADD_SUB_I64(SDNode *N);
+ SDNode *SelectDIV_SCALE(SDNode *N);
+
+ SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
+ uint32_t Offset, uint32_t Width);
+ SDNode *SelectS_BFEFromShifts(SDNode *N);
+ SDNode *SelectS_BFE(SDNode *N);
+
+ // Include the pieces autogenerated from the target description.
+#include "AMDGPUGenDAGISel.inc"
+};
+} // end anonymous namespace
+
+/// \brief This pass converts a legalized DAG into an AMDGPU-specific
+/// DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) {
+ return new AMDGPUDAGToDAGISel(TM);
+}
+
+AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
+ : SelectionDAGISel(TM) {}
+
+bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget());
+ return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
+}
+
+bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const {
+ const SITargetLowering *TL
+ = static_cast<const SITargetLowering *>(getTargetLowering());
+ return TL->analyzeImmediate(N) == 0;
+}
+
+/// \brief Determine the register class for \p OpNo.
+/// \returns The register class of the virtual register that will be used for
+/// the given operand number \p OpNo, or nullptr if the register class cannot
+/// be determined.
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
+ unsigned OpNo) const {
+ if (!N->isMachineOpcode())
+ return nullptr;
+
+ switch (N->getMachineOpcode()) {
+ default: {
+ const MCInstrDesc &Desc =
+ Subtarget->getInstrInfo()->get(N->getMachineOpcode());
+ unsigned OpIdx = Desc.getNumDefs() + OpNo;
+ if (OpIdx >= Desc.getNumOperands())
+ return nullptr;
+ int RegClass = Desc.OpInfo[OpIdx].RegClass;
+ if (RegClass == -1)
+ return nullptr;
+
+ return Subtarget->getRegisterInfo()->getRegClass(RegClass);
+ }
+ case AMDGPU::REG_SEQUENCE: {
+ unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ const TargetRegisterClass *SuperRC =
+ Subtarget->getRegisterInfo()->getRegClass(RCID);
+
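+    // REG_SEQUENCE operands are the register-class id followed by
+    // (value, subreg-index) pairs, so the subreg index for this operand is
+    // the operand immediately after it.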
+ SDValue SubRegOp = N->getOperand(OpNo + 1);
+ unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
+ return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
+ SubRegIdx);
+ }
+ }
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRParam(
+ SDValue Addr, SDValue& R1, SDValue& R2) {
+
+ if (Addr.getOpcode() == ISD::FrameIndex) {
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+ R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ } else {
+ R1 = Addr;
+ R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ }
+ } else if (Addr.getOpcode() == ISD::ADD) {
+ R1 = Addr.getOperand(0);
+ R2 = Addr.getOperand(1);
+ } else {
+ R1 = Addr;
+ R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ }
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress) {
+ return false;
+ }
+ return SelectADDRParam(Addr, R1, R2);
+}
+
+
+bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress) {
+ return false;
+ }
+
+ if (Addr.getOpcode() == ISD::FrameIndex) {
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+ R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
+ } else {
+ R1 = Addr;
+ R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
+ }
+ } else if (Addr.getOpcode() == ISD::ADD) {
+ R1 = Addr.getOperand(0);
+ R2 = Addr.getOperand(1);
+ } else {
+ R1 = Addr;
+ R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64);
+ }
+ return true;
+}
+
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+ !checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(),
+ AMDGPUAS::LOCAL_ADDRESS))
+ return N;
+
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
+
+ // Write max value to m0 before each load operation
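+  // (m0 provides the upper bound used to clamp DS addresses; writing the
+  // maximum value, i.e. -1, effectively disables the clamp.)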
+
+ SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
+ CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+
+ SDValue Glue = M0.getValue(1);
+
+ SmallVector <SDValue, 8> Ops;
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ Ops.push_back(N->getOperand(i));
+ }
+ Ops.push_back(Glue);
+ CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
+
+ return N;
+}
+
+SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
+ unsigned int Opc = N->getOpcode();
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
+ return nullptr; // Already selected.
+ }
+
+ if (isa<AtomicSDNode>(N))
+ N = glueCopyToM0(N);
+
+ switch (Opc) {
+ default: break;
+  // We are selecting i64 ADD here instead of custom lowering it during
+  // DAG legalization, so we can fold some i64 ADDs used for address
+  // calculation into the LOAD and STORE instructions.
+ case ISD::ADD:
+ case ISD::SUB: {
+ if (N->getValueType(0) != MVT::i64 ||
+ Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ break;
+
+ return SelectADD_SUB_I64(N);
+ }
+ case ISD::SCALAR_TO_VECTOR:
+ case AMDGPUISD::BUILD_VERTICAL_VECTOR:
+ case ISD::BUILD_VECTOR: {
+ unsigned RegClassID;
+ const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ EVT VT = N->getValueType(0);
+ unsigned NumVectorElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ assert(EltVT.bitsEq(MVT::i32));
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ bool UseVReg = true;
+ for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+ U != E; ++U) {
+ if (!U->isMachineOpcode()) {
+ continue;
+ }
+ const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+ if (!RC) {
+ continue;
+ }
+ if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) {
+ UseVReg = false;
+ }
+ }
+ switch(NumVectorElts) {
+ case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID :
+ AMDGPU::SReg_32RegClassID;
+ break;
+ case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
+ AMDGPU::SReg_64RegClassID;
+ break;
+ case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
+ AMDGPU::SReg_128RegClassID;
+ break;
+ case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
+ AMDGPU::SReg_256RegClassID;
+ break;
+ case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
+ AMDGPU::SReg_512RegClassID;
+ break;
+ default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+ }
+ } else {
+      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG,
+      // which adds a 128-bit register copy when going through the
+      // TwoAddressInstructions pass. We want to avoid 128-bit copies as much
+      // as possible because they can't be bundled by our scheduler.
+ switch(NumVectorElts) {
+ case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+ case 4:
+ if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+ RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+ else
+ RegClassID = AMDGPU::R600_Reg128RegClassID;
+ break;
+ default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+ }
+ }
+
+ SDLoc DL(N);
+ SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+
+ if (NumVectorElts == 1) {
+ return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT,
+ N->getOperand(0), RegClass);
+ }
+
+ assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+ "supported yet");
+ // 16 = Max Num Vector Elements
+ // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+ // 1 = Vector Register Class
+ SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+
+ RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+ bool IsRegSeq = true;
+ unsigned NOps = N->getNumOperands();
+ for (unsigned i = 0; i < NOps; i++) {
+ // XXX: Why is this here?
+ if (isa<RegisterSDNode>(N->getOperand(i))) {
+ IsRegSeq = false;
+ break;
+ }
+ RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+ RegSeqArgs[1 + (2 * i) + 1] =
+ CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
+ MVT::i32);
+ }
+
+ if (NOps != NumVectorElts) {
+ // Fill in the missing undef elements if this was a scalar_to_vector.
+ assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
+
+ MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ DL, EltVT);
+ for (unsigned i = NOps; i < NumVectorElts; ++i) {
+ RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
+ RegSeqArgs[1 + (2 * i) + 1] =
+ CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
+ }
+ }
+
+ if (!IsRegSeq)
+ break;
+ return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
+ RegSeqArgs);
+ }
+ case ISD::BUILD_PAIR: {
+ SDValue RC, SubReg0, SubReg1;
+ if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ break;
+ }
+ SDLoc DL(N);
+ if (N->getValueType(0) == MVT::i128) {
+ RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
+ SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
+ SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
+ } else if (N->getValueType(0) == MVT::i64) {
+ RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
+ SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+ } else {
+ llvm_unreachable("Unhandled value type for BUILD_PAIR");
+ }
+ const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
+ N->getOperand(1), SubReg1 };
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+ DL, N->getValueType(0), Ops);
+ }
+
+ case ISD::Constant:
+ case ISD::ConstantFP: {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+ N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+ break;
+
+ uint64_t Imm;
+ if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
+ Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
+ else {
+ ConstantSDNode *C = cast<ConstantSDNode>(N);
+ Imm = C->getZExtValue();
+ }
+
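+    // Materialize the 64-bit immediate as two 32-bit S_MOV_B32s and combine
+    // them with a REG_SEQUENCE (low half in sub0, high half in sub1).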
+ SDLoc DL(N);
+ SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
+ MVT::i32));
+ SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+ SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+ };
+
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+ N->getValueType(0), Ops);
+ }
+
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
+ N = glueCopyToM0(N);
+ break;
+ }
+
+    // To simplify the TableGen patterns, we replace all i64 loads with
+    // v2i32 loads here. Alternatively, we could promote i64 loads to v2i32
+    // during DAG legalization; however, some places in the DAG legalizer
+    // (e.g. ExpandUnalignedLoad) make assumptions based on i64 being a legal
+    // type, so doing the promotion that early can cause problems.
+
+ SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
+ LD->getBasePtr(), LD->getMemOperand());
+ SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
+ MVT::i64, NewLoad);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
+ SDNode *Load = glueCopyToM0(NewLoad.getNode());
+ SelectCode(Load);
+ N = BitCast.getNode();
+ break;
+ }
+
+ case ISD::STORE: {
+ // Handle i64 stores here for the same reason mentioned above for loads.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Value = ST->getValue();
+ if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
+
+ SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+ MVT::v2i32, Value);
+ SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
+ ST->getBasePtr(), ST->getMemOperand());
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
+
+ if (NewValue.getOpcode() == ISD::BITCAST) {
+ Select(NewStore.getNode());
+ return SelectCode(NewValue.getNode());
+ }
+
+ // getNode() may fold the bitcast if its input was another bitcast. If that
+ // happens we should only select the new store.
+ N = NewStore.getNode();
+ }
+
+ N = glueCopyToM0(N);
+ break;
+ }
+
+ case AMDGPUISD::REGISTER_LOAD: {
+ if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+ break;
+ SDValue Addr, Offset;
+
+ SDLoc DL(N);
+ SelectADDRIndirect(N->getOperand(1), Addr, Offset);
+ const SDValue Ops[] = {
+ Addr,
+ Offset,
+ CurDAG->getTargetConstant(0, DL, MVT::i32),
+ N->getOperand(0),
+ };
+ return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL,
+ CurDAG->getVTList(MVT::i32, MVT::i64,
+ MVT::Other),
+ Ops);
+ }
+ case AMDGPUISD::REGISTER_STORE: {
+ if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+ break;
+ SDValue Addr, Offset;
+ SelectADDRIndirect(N->getOperand(2), Addr, Offset);
+ SDLoc DL(N);
+ const SDValue Ops[] = {
+ N->getOperand(1),
+ Addr,
+ Offset,
+ CurDAG->getTargetConstant(0, DL, MVT::i32),
+ N->getOperand(0),
+ };
+ return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL,
+ CurDAG->getVTList(MVT::Other),
+ Ops);
+ }
+
+ case AMDGPUISD::BFE_I32:
+ case AMDGPUISD::BFE_U32: {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ break;
+
+    // There is a scalar version available, but unlike the vector version,
+    // which has separate operands for the offset and width, the scalar
+    // version packs the width and offset into a single operand. Try to move
+    // to the scalar version if the offsets are constant, so that we can try
+    // to keep extended loads of kernel arguments in SGPRs.
+
+ // TODO: Technically we could try to pattern match scalar bitshifts of
+ // dynamic values, but it's probably not useful.
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!Offset)
+ break;
+
+ ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!Width)
+ break;
+
+ bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+ uint32_t OffsetVal = Offset->getZExtValue();
+ uint32_t WidthVal = Width->getZExtValue();
+
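+    // (getS_BFE is expected to pack these into the single scalar operand,
+    // presumably as OffsetVal | (WidthVal << 16).)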
+ return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N),
+ N->getOperand(0), OffsetVal, WidthVal);
+
+ }
+ case AMDGPUISD::DIV_SCALE: {
+ return SelectDIV_SCALE(N);
+ }
+ case ISD::CopyToReg: {
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
+ Lowering.legalizeTargetIndependentNode(N, *CurDAG);
+ break;
+ }
+ case ISD::ADDRSPACECAST:
+ return SelectAddrSpaceCast(N);
+ case ISD::AND:
+ case ISD::SRL:
+ case ISD::SRA:
+ if (N->getValueType(0) != MVT::i32 ||
+ Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ break;
+
+ return SelectS_BFE(N);
+ }
+
+ return SelectCode(N);
+}
+
+
+bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
+ assert(AS != 0 && "Use checkPrivateAddress instead.");
+ if (!Ptr)
+ return false;
+
+ return Ptr->getType()->getPointerAddressSpace() == AS;
+}
+
+bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) {
+ if (Op->getPseudoValue())
+ return true;
+
+ if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType()))
+ return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
+ const Value *MemVal = N->getMemOperand()->getValue();
+ return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::REGION_ADDRESS));
+}
+
+bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
+ const Value *MemVal = N->getMemOperand()->getValue();
+ if (CbId == -1)
+ return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS);
+
+ return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId);
+}
+
+bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
+ if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+ N->getMemoryVT().bitsLT(MVT::i32))
+ return true;
+
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
+ MachineMemOperand *MMO = N->getMemOperand();
+ if (checkPrivateAddress(N->getMemOperand())) {
+ if (MMO) {
+ const PseudoSourceValue *PSV = MMO->getPseudoValue();
+ if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
+ if (checkPrivateAddress(N->getMemOperand())) {
+ // Check to make sure we are not a constant pool load or a constant load
+ // that is marked as a private load
+ if (isCPLoad(N) || isConstantLoad(N, -1)) {
+ return false;
+ }
+ }
+
+ const Value *MemVal = N->getMemOperand()->getValue();
+ if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) {
+ return true;
+ }
+ return false;
+}
+
+const char *AMDGPUDAGToDAGISel::getPassName() const {
+ return "AMDGPU DAG->DAG Pattern Instruction Selection";
+}
+
+#ifdef DEBUGTMP
+#undef INT64_C
+#endif
+#undef DEBUGTMP
+
+//===----------------------------------------------------------------------===//
+// Complex Patterns
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+ SDValue& IntPtr) {
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
+ IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
+ true);
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+ SDValue& BaseReg, SDValue &Offset) {
+ if (!isa<ConstantSDNode>(Addr)) {
+ BaseReg = Addr;
+ Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ ConstantSDNode *IMMOffset;
+
+ if (Addr.getOpcode() == ISD::ADD
+ && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+ && isInt<16>(IMMOffset->getZExtValue())) {
+
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+ MVT::i32);
+ return true;
+ // If the pointer address is constant, we can move it to the offset field.
+ } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
+ && isInt<16>(IMMOffset->getZExtValue())) {
+ Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+ SDLoc(CurDAG->getEntryNode()),
+ AMDGPU::ZERO, MVT::i32);
+ Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+ MVT::i32);
+ return true;
+ }
+
+ // Default case, no offset
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ ConstantSDNode *C;
+ SDLoc DL(Addr);
+
+ if ((C = dyn_cast<ConstantSDNode>(Addr))) {
+ Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else {
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ }
+
+ return true;
+}
+
+SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ bool IsAdd = (N->getOpcode() == ISD::ADD);
+
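+  // Split both operands into 32-bit halves: add/sub the low halves first,
+  // combine the high halves with the carry-in variant, then reassemble the
+  // 64-bit result with a REG_SEQUENCE.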
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+ SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, LHS, Sub0);
+ SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, LHS, Sub1);
+
+ SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, RHS, Sub0);
+ SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, RHS, Sub1);
+
+ SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
+ SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
+
+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+
+  SDNode *AddLo = CurDAG->getMachineNode(Opc, DL, VTList, AddLoArgs);
+ SDValue Carry(AddLo, 1);
+ SDNode *AddHi
+ = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32,
+ SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
+
+ SDValue Args[5] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+    SDValue(AddLo, 0),
+    Sub0,
+    SDValue(AddHi, 0),
+ Sub1,
+ };
+ return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
+}
+
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+
+ assert(VT == MVT::f32 || VT == MVT::f64);
+
+ unsigned Opc
+ = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
+
+ // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+ SDValue Ops[8];
+
+ SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
+ SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
+ SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
+ return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+}
+
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+ unsigned OffsetBits) const {
+ if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
+ (OffsetBits == 8 && !isUInt<8>(Offset)))
+ return false;
+
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+ return true;
+
+  // On Southern Islands, instructions with a negative base value and an
+  // offset don't seem to work.
+ return CurDAG->SignBitIsZero(Base);
+}
+
+bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+ if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
+ // (add n0, c0)
+ Base = N0;
+ Offset = N1;
+ return true;
+ }
+ }
+
+ SDLoc DL(Addr);
+
+ // If we have a constant address, prefer to put the constant into the
+ // offset. This can save moves to load the constant address since multiple
+ // operations can share the zero base address register, and enables merging
+ // into read2 / write2 instructions.
+ if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+ if (isUInt<16>(CAddr->getZExtValue())) {
+ SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ DL, MVT::i32, Zero);
+ Base = SDValue(MovZero, 0);
+ Offset = Addr;
+ return true;
+ }
+ }
+
+ // default case
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
+ SDValue &Offset0,
+ SDValue &Offset1) const {
+ SDLoc DL(Addr);
+
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+ unsigned DWordOffset0 = C1->getZExtValue() / 4;
+ unsigned DWordOffset1 = DWordOffset0 + 1;
+ // (add n0, c0)
+ if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
+ Base = N0;
+ Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+ return true;
+ }
+ }
+
+ if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+ unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
+ unsigned DWordOffset1 = DWordOffset0 + 1;
+ assert(4 * DWordOffset0 == CAddr->getZExtValue());
+
+ if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
+ SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ MachineSDNode *MovZero
+ = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ DL, MVT::i32, Zero);
+