AMDGPU: Add core backend files for R600/SI codegen v6
author    Tom Stellard <thomas.stellard@amd.com>
          Mon, 16 Jul 2012 14:17:08 +0000 (14:17 +0000)
committer Tom Stellard <thomas.stellard@amd.com>
          Mon, 16 Jul 2012 14:17:08 +0000 (14:17 +0000)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@160270 91177308-0d34-0410-b5e6-96231b3b80d8

114 files changed:
lib/Target/AMDGPU/AMDGPU.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPU.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUConvertToISA.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUISelLowering.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUISelLowering.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUInstrInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUInstrInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUInstrInfo.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUInstructions.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUIntrinsics.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPURegisterInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPURegisterInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPURegisterInfo.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUSubtarget.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUTargetMachine.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUUtil.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDGPUUtil.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDIL.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDIL7XXDevice.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDIL7XXDevice.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILAlgorithms.tpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILBase.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILCFGStructurizer.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILCallingConv.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILCodeEmitter.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILDevice.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILDevice.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILDeviceInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILDeviceInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILDevices.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILEnumeratedTypes.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILEvergreenDevice.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILEvergreenDevice.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILFormats.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILFrameLowering.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILFrameLowering.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILISelLowering.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILISelLowering.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILInstrInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILInstrInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILInstrInfo.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILInstructions.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILIntrinsicInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILIntrinsicInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILIntrinsics.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILMultiClass.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILNIDevice.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILNIDevice.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILNodes.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILOperands.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILPatterns.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILPeepholeOptimizer.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILProfiles.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILRegisterInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILRegisterInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILRegisterInfo.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILSIDevice.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILSIDevice.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILSubtarget.cpp [new file with mode: 0644]
lib/Target/AMDGPU/AMDILSubtarget.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILTokenDesc.td [new file with mode: 0644]
lib/Target/AMDGPU/AMDILUtilityFunctions.h [new file with mode: 0644]
lib/Target/AMDGPU/AMDILVersion.td [new file with mode: 0644]
lib/Target/AMDGPU/CMakeLists.txt [new file with mode: 0644]
lib/Target/AMDGPU/GENERATED_FILES [new file with mode: 0644]
lib/Target/AMDGPU/LLVMBuild.txt [new file with mode: 0644]
lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp [new file with mode: 0644]
lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h [new file with mode: 0644]
lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt [new file with mode: 0644]
lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt [new file with mode: 0644]
lib/Target/AMDGPU/MCTargetDesc/Makefile [new file with mode: 0644]
lib/Target/AMDGPU/Makefile [new file with mode: 0644]
lib/Target/AMDGPU/Processors.td [new file with mode: 0644]
lib/Target/AMDGPU/R600CodeEmitter.cpp [new file with mode: 0644]
lib/Target/AMDGPU/R600GenRegisterInfo.pl [new file with mode: 0644]
lib/Target/AMDGPU/R600HwRegInfo.include [new file with mode: 0644]
lib/Target/AMDGPU/R600ISelLowering.cpp [new file with mode: 0644]
lib/Target/AMDGPU/R600ISelLowering.h [new file with mode: 0644]
lib/Target/AMDGPU/R600InstrInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/R600InstrInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/R600Instructions.td [new file with mode: 0644]
lib/Target/AMDGPU/R600Intrinsics.td [new file with mode: 0644]
lib/Target/AMDGPU/R600KernelParameters.cpp [new file with mode: 0644]
lib/Target/AMDGPU/R600MachineFunctionInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/R600MachineFunctionInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/R600RegisterInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/R600RegisterInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/R600RegisterInfo.td [new file with mode: 0644]
lib/Target/AMDGPU/R600Schedule.td [new file with mode: 0644]
lib/Target/AMDGPU/SIAssignInterpRegs.cpp [new file with mode: 0644]
lib/Target/AMDGPU/SICodeEmitter.cpp [new file with mode: 0644]
lib/Target/AMDGPU/SIGenRegisterInfo.pl [new file with mode: 0644]
lib/Target/AMDGPU/SIISelLowering.cpp [new file with mode: 0644]
lib/Target/AMDGPU/SIISelLowering.h [new file with mode: 0644]
lib/Target/AMDGPU/SIInstrFormats.td [new file with mode: 0644]
lib/Target/AMDGPU/SIInstrInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/SIInstrInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/SIInstrInfo.td [new file with mode: 0644]
lib/Target/AMDGPU/SIInstructions.td [new file with mode: 0644]
lib/Target/AMDGPU/SIIntrinsics.td [new file with mode: 0644]
lib/Target/AMDGPU/SIMachineFunctionInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/SIMachineFunctionInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/SIRegisterInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/SIRegisterInfo.h [new file with mode: 0644]
lib/Target/AMDGPU/SIRegisterInfo.td [new file with mode: 0644]
lib/Target/AMDGPU/SISchedule.td [new file with mode: 0644]
lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp [new file with mode: 0644]
lib/Target/AMDGPU/TargetInfo/CMakeLists.txt [new file with mode: 0644]
lib/Target/AMDGPU/TargetInfo/LLVMBuild.txt [new file with mode: 0644]
lib/Target/AMDGPU/TargetInfo/Makefile [new file with mode: 0644]

diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
new file mode 100644 (file)
index 0000000..191f495
--- /dev/null
@@ -0,0 +1,35 @@
+//===-- AMDGPU.h - MachineFunction passes for hw codegen ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_H
+#define AMDGPU_H
+
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class FunctionPass;
+class AMDGPUTargetMachine;
+
+// R600 Passes
+FunctionPass* createR600KernelParametersPass(const TargetData* TD);
+FunctionPass *createR600CodeEmitterPass(formatted_raw_ostream &OS);
+
+// SI Passes
+FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
+FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
+
+// Passes common to R600 and SI
+FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
+
+} // End namespace llvm
+
+#endif // AMDGPU_H
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
new file mode 100644 (file)
index 0000000..1bb5fb9
--- /dev/null
@@ -0,0 +1,21 @@
+//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+// Include AMDIL TD files
+include "AMDILBase.td"
+include "AMDILVersion.td"
+
+// Include AMDGPU TD files
+include "R600Schedule.td"
+include "SISchedule.td"
+include "Processors.td"
+include "AMDGPUInstrInfo.td"
+include "AMDGPUIntrinsics.td"
+include "AMDGPURegisterInfo.td"
+include "AMDGPUInstructions.td"
diff --git a/lib/Target/AMDGPU/AMDGPUConvertToISA.cpp b/lib/Target/AMDGPU/AMDGPUConvertToISA.cpp
new file mode 100644 (file)
index 0000000..5e8fe9a
--- /dev/null
@@ -0,0 +1,63 @@
+//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers AMDIL machine instructions to the appropriate hardware
+// instructions. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+#include <stdio.h>
+using namespace llvm;
+
+namespace {
+
+class AMDGPUConvertToISAPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  TargetMachine &TM;
+
+public:
+  AMDGPUConvertToISAPass(TargetMachine &tm) :
+    MachineFunctionPass(ID), TM(tm) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual const char *getPassName() const {return "AMDGPU Convert to ISA";}
+
+};
+
+} // End anonymous namespace
+
+char AMDGPUConvertToISAPass::ID = 0;
+
+FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) {
+  return new AMDGPUConvertToISAPass(tm);
+}
+
+bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF)
+{
+  const AMDGPUInstrInfo * TII =
+                      static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      TII->convertToISA(MI, MF, MBB.findDebugLoc(I));
+    }
+  }
+  return false;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
new file mode 100644 (file)
index 0000000..b3d27f7
--- /dev/null
@@ -0,0 +1,393 @@
+//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the parent TargetLowering class for hardware code gen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUISelLowering.h"
+#include "AMDILIntrinsicInfo.h"
+#include "AMDGPUUtil.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
+  AMDILTargetLowering(TM)
+{
+  // We need to custom lower some of the intrinsics
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+
+  // Library functions.  These default to Expand, but we have instructions
+  // for them.
+  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
+  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
+  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
+
+  setOperationAction(ISD::UDIV, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+}
+
+SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
+    const
+{
+  switch (Op.getOpcode()) {
+  default: return AMDILTargetLowering::LowerOperation(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+  }
+}
+
+SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  switch (IntrinsicID) {
+    default: return Op;
+    case AMDGPUIntrinsic::AMDIL_abs:
+      return LowerIntrinsicIABS(Op, DAG);
+    case AMDGPUIntrinsic::AMDIL_exp:
+      return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
+    case AMDGPUIntrinsic::AMDIL_fabs:
+      return DAG.getNode(ISD::FABS, DL, VT, Op.getOperand(1));
+    case AMDGPUIntrinsic::AMDGPU_lrp:
+      return LowerIntrinsicLRP(Op, DAG);
+    case AMDGPUIntrinsic::AMDIL_fraction:
+      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+    case AMDGPUIntrinsic::AMDIL_mad:
+      return DAG.getNode(AMDILISD::MAD, DL, VT, Op.getOperand(1),
+                              Op.getOperand(2), Op.getOperand(3));
+    case AMDGPUIntrinsic::AMDIL_max:
+      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_imax:
+      return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_umax:
+      return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDIL_min:
+      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_imin:
+      return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_umin:
+      return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDIL_round_nearest:
+      return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
+    case AMDGPUIntrinsic::AMDIL_round_posinf:
+      return DAG.getNode(ISD::FCEIL, DL, VT, Op.getOperand(1));
+  }
+}
+
+/// IABS(a) = SMAX(sub(0, a), a)
+SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+                                              Op.getOperand(1));
+
+  return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
+}
+
+/// Linear Interpolation
+/// LRP(a, b, c) = muladd(a,  b, (1 - a) * c)
+SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
+                                DAG.getConstantFP(1.0f, MVT::f32),
+                                Op.getOperand(1));
+  SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
+                                                    Op.getOperand(3));
+  return DAG.getNode(AMDILISD::MAD, DL, VT, Op.getOperand(1),
+                                               Op.getOperand(2),
+                                               OneSubAC);
+}
+
+SDValue AMDGPUTargetLowering::LowerSELECT_CC(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue True = Op.getOperand(2);
+  SDValue False = Op.getOperand(3);
+  SDValue CC = Op.getOperand(4);
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+  SDValue Temp;
+
+  // LHS and RHS are guaranteed to be the same value type
+  EVT CompareVT = LHS.getValueType();
+
+  // We need all the operands of SELECT_CC to have the same value type, so if
+  // necessary we need to convert LHS and RHS to be the same type True and
+  // False.  True and False are guaranteed to have the same type as this
+  // SELECT_CC node.
+
+  if (CompareVT !=  VT) {
+    ISD::NodeType ConversionOp = ISD::DELETED_NODE;
+    if (VT == MVT::f32 && CompareVT == MVT::i32) {
+      if (isUnsignedIntSetCC(CCOpcode)) {
+        ConversionOp = ISD::UINT_TO_FP;
+      } else {
+        ConversionOp = ISD::SINT_TO_FP;
+      }
+    } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
+      ConversionOp = ISD::FP_TO_SINT;
+    } else {
+      // I don't think there will be any other type pairings.
+      assert(!"Unhandled operand type parings in SELECT_CC");
+    }
+    // XXX Check the value of LHS and RHS and avoid creating sequences like
+    // (FTOI (ITOF))
+    LHS = DAG.getNode(ConversionOp, DL, VT, LHS);
+    RHS = DAG.getNode(ConversionOp, DL, VT, RHS);
+  }
+
+  // If True is a hardware TRUE value and False is a hardware FALSE value or
+  // vice-versa we can handle this with a native instruction (SET* instructions).
+  if ((isHWTrueValue(True) && isHWFalseValue(False))) {
+    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+  }
+
+  // XXX If True is a hardware TRUE value and False is a hardware FALSE value,
+  // we can handle this with a native instruction, but we need to swap true
+  // and false and change the conditional.
+  if (isHWTrueValue(False) && isHWFalseValue(True)) {
+  }
+
+  // XXX Check if we can lower this to a SELECT or if it is supported by a
+  // native operation. (The code below does this, but we don't have the
+  // instruction selection patterns to do it yet.)
+#if 0
+  if (isZero(LHS) || isZero(RHS)) {
+    SDValue Cond = (isZero(LHS) ? RHS : LHS);
+    bool SwapTF = false;
+    switch (CCOpcode) {
+    case ISD::SETOEQ:
+    case ISD::SETUEQ:
+    case ISD::SETEQ:
+      SwapTF = true;
+      // Fall through
+    case ISD::SETONE:
+    case ISD::SETUNE:
+    case ISD::SETNE:
+      // We can lower to select
+      if (SwapTF) {
+        Temp = True;
+        True = False;
+        False = Temp;
+      }
+      // CNDE
+      return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
+    default:
+      // Supported by a native operation (CNDGE, CNDGT)
+      return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+    }
+  }
+#endif
+
+  // If we make it this far, it means we have no native instructions to handle
+  // this SELECT_CC, so we must lower it.
+  SDValue HWTrue, HWFalse;
+
+  if (VT == MVT::f32) {
+    HWTrue = DAG.getConstantFP(1.0f, VT);
+    HWFalse = DAG.getConstantFP(0.0f, VT);
+  } else if (VT == MVT::i32) {
+    HWTrue = DAG.getConstant(-1, VT);
+    HWFalse = DAG.getConstant(0, VT);
+  } else {
+    assert(!"Unhandled value type in LowerSELECT_CC");
+  }
+
+  // Lower this unsupported SELECT_CC into a combination of two supported
+  // SELECT_CC operations.
+  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS,
+                             HWTrue, HWFalse, CC);
+
+  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
+}
+
+SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  SDValue Num = Op.getOperand(0);
+  SDValue Den = Op.getOperand(1);
+
+  SmallVector<SDValue, 8> Results;
+
+  // RCP = URECIP(Den) = 2^32 / Den + e
+  // where e is the rounding error.
+  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
+
+  // RCP_LO = umulo(RCP, Den)
+  SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
+
+  // RCP_HI = mulhu(RCP, Den)
+  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
+
+  // NEG_RCP_LO = -RCP_LO
+  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+                                                     RCP_LO);
+
+  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
+                                           NEG_RCP_LO, RCP_LO,
+                                           ISD::SETEQ);
+  // Calculate the rounding error from the URECIP instruction
+  // E = mulhu(ABS_RCP_LO, RCP)
+  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
+
+  // RCP_A_E = RCP + E
+  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
+
+  // RCP_S_E = RCP - E
+  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
+
+  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
+  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
+                                     RCP_A_E, RCP_S_E,
+                                     ISD::SETEQ);
+  // Quotient = mulhu(Tmp0, Num)
+  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
+
+  // Num_S_Remainder = Quotient * Den
+  SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
+
+  // Remainder = Num - Num_S_Remainder
+  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
+
+  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
+                                                 DAG.getConstant(-1, VT),
+                                                 DAG.getConstant(0, VT),
+                                                 ISD::SETGE);
+  // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0)
+  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder,
+                                                  DAG.getConstant(0, VT),
+                                                  DAG.getConstant(-1, VT),
+                                                  DAG.getConstant(0, VT),
+                                                  ISD::SETGE);
+  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
+                                               Remainder_GE_Zero);
+
+  // Calculate Division result:
+
+  // Quotient_A_One = Quotient + 1
+  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
+                                                         DAG.getConstant(1, VT));
+
+  // Quotient_S_One = Quotient - 1
+  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
+                                                         DAG.getConstant(1, VT));
+
+  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
+  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
+                                     Quotient, Quotient_A_One, ISD::SETEQ);
+
+  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
+  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
+                            Quotient_S_One, Div, ISD::SETEQ);
+
+  // Calculate Rem result:
+
+  // Remainder_S_Den = Remainder - Den
+  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
+
+  // Remainder_A_Den = Remainder + Den
+  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
+
+  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
+  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
+                                    Remainder, Remainder_S_Den, ISD::SETEQ);
+
+  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
+  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
+                            Remainder_A_Den, Rem, ISD::SETEQ);
+
+  DAG.ReplaceAllUsesWith(Op.getValue(0).getNode(), &Div);
+  DAG.ReplaceAllUsesWith(Op.getValue(1).getNode(), &Rem);
+
+  return Op;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const
+{
+  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+    return CFP->isExactlyValue(1.0);
+  }
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+    return C->isAllOnesValue();
+  }
+  return false;
+}
+
+bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const
+{
+  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+    return CFP->getValueAPF().isZero();
+  }
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+    return C->isNullValue();
+  }
+  return false;
+}
+
+void AMDGPUTargetLowering::addLiveIn(MachineInstr * MI,
+    MachineFunction * MF, MachineRegisterInfo & MRI,
+    const TargetInstrInfo * TII, unsigned reg) const
+{
+  AMDGPU::utilAddLiveIn(MF, MRI, TII, reg, MI->getOperand(0).getReg());
+}
+
+#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
+
+const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const
+{
+  switch (Opcode) {
+  default: return AMDILTargetLowering::getTargetNodeName(Opcode);
+
+  NODE_NAME_CASE(FRACT)
+  NODE_NAME_CASE(FMAX)
+  NODE_NAME_CASE(SMAX)
+  NODE_NAME_CASE(UMAX)
+  NODE_NAME_CASE(FMIN)
+  NODE_NAME_CASE(SMIN)
+  NODE_NAME_CASE(UMIN)
+  NODE_NAME_CASE(URECIP)
+  }
+}
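(For reference: the UDIVREM lowering above can be modeled node-for-node on the
host. The sketch below is illustrative only and not part of this patch; 64-bit
arithmetic stands in for the 32-bit MULHU/UMULO nodes, URECIP is modeled as a
truncated 2^32 / Den, and the Remainder_GE_* checks use signed compares
because the DAG sequence selects with ISD::SETGE.)

#include <cassert>
#include <cstdint>

static void udivrem32(uint32_t num, uint32_t den, uint32_t &quo, uint32_t &rem) {
  uint32_t rcp    = (uint32_t)(((uint64_t)1 << 32) / den);       // URECIP(den)
  uint32_t rcp_lo = rcp * den;                                   // UMULO (low half)
  uint32_t rcp_hi = (uint32_t)(((uint64_t)rcp * den) >> 32);     // MULHU
  uint32_t abs_lo = (rcp_hi == 0) ? 0u - rcp_lo : rcp_lo;        // ABS_RCP_LO
  uint32_t e      = (uint32_t)(((uint64_t)abs_lo * rcp) >> 32);  // rounding error
  uint32_t tmp0   = (rcp_hi == 0) ? rcp + e : rcp - e;           // refined reciprocal
  uint32_t q      = (uint32_t)(((uint64_t)tmp0 * num) >> 32);    // Quotient guess
  uint32_t r      = num - q * den;                               // Remainder guess
  bool r_ge_den   = (int32_t)r >= (int32_t)den;                  // SETGE (signed)
  bool r_ge_zero  = (int32_t)r >= 0;                             // SETGE (signed)
  bool both       = r_ge_den && r_ge_zero;                       // Tmp1
  quo = !r_ge_zero ? q - 1 : (both ? q + 1 : q);                 // Div corrections
  rem = !r_ge_zero ? r + den : (both ? r - den : r);             // Rem corrections
}

int main() {
  uint32_t q, r;
  udivrem32(100, 7, q, r);
  assert(q == 14 && r == 2);
  udivrem32(6, 3, q, r);  // exercises the Quotient+1 / Remainder-Den correction
  assert(q == 2 && r == 0);
  return 0;
}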
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
new file mode 100644 (file)
index 0000000..72342c9
--- /dev/null
@@ -0,0 +1,77 @@
+//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the interface definition of the TargetLowering class
+// that is common to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUISELLOWERING_H
+#define AMDGPUISELLOWERING_H
+
+#include "AMDILISelLowering.h"
+
+namespace llvm {
+
+class AMDGPUTargetLowering : public AMDILTargetLowering
+{
+private:
+  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+
+protected:
+
+  /// addLiveIn - This function adds reg to the live-in list of the entry
+  /// block and emits a copy from reg to MI.getOperand(0).
+  ///
+  /// Some registers are loaded with values before the program
+  /// begins to execute.  The loading of these values is modeled with pseudo
+  /// instructions which are lowered using this function.
+  void addLiveIn(MachineInstr * MI, MachineFunction * MF,
+                 MachineRegisterInfo & MRI, const TargetInstrInfo * TII,
+                 unsigned reg) const;
+
+  bool isHWTrueValue(SDValue Op) const;
+  bool isHWFalseValue(SDValue Op) const;
+
+public:
+  AMDGPUTargetLowering(TargetMachine &TM);
+
+  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
+  virtual const char* getTargetNodeName(unsigned Opcode) const;
+
+};
+
+namespace AMDGPUISD
+{
+
+enum
+{
+  AMDGPU_FIRST = AMDILISD::LAST_ISD_NUMBER,
+  BITALIGN,
+  FRACT,
+  FMAX,
+  SMAX,
+  UMAX,
+  FMIN,
+  SMIN,
+  UMIN,
+  URECIP,
+  LAST_AMDGPU_ISD_NUMBER
+};
+
+} // End namespace AMDGPUISD
+
+} // End namespace llvm
+
+#endif // AMDGPUISELLOWERING_H
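(LowerSELECT_CC, declared above, rewrites a SELECT_CC whose True/False
operands are not hardware booleans (f32 1.0/0.0, i32 all-ones/zero) into an
inner SELECT_CC that yields a hardware boolean plus an outer SELECT. Below is
a minimal host-side model of the i32 case; illustrative only, and the
function name is hypothetical.)

#include <cassert>
#include <cstdint>

int32_t select_cc_sgt(int32_t lhs, int32_t rhs, int32_t tval, int32_t fval) {
  // Step 1: a native SET* op evaluates the condition into a hardware boolean.
  int32_t cond = (lhs > rhs) ? -1 : 0;  // SELECT_CC lhs, rhs, -1, 0, SETGT
  // Step 2: a plain SELECT picks between the original True/False operands.
  return cond ? tval : fval;            // SELECT cond, tval, fval
}

int main() {
  assert(select_cc_sgt(5, 3, 100, 200) == 100);
  assert(select_cc_sgt(3, 5, 100, 200) == 200);
  return 0;
}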
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
new file mode 100644 (file)
index 0000000..3c1f063
--- /dev/null
@@ -0,0 +1,46 @@
+//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the TargetInstrInfo class that is
+// common to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "AMDIL.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+AMDGPUInstrInfo::AMDGPUInstrInfo(AMDGPUTargetMachine &tm)
+  : AMDILInstrInfo(tm) { }
+
+void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
+    DebugLoc DL) const
+{
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const AMDGPURegisterInfo & RI = getRegisterInfo();
+
+  for (unsigned i = 0; i < MI.getNumOperands(); i++) {
+    MachineOperand &MO = MI.getOperand(i);
+    // Convert dst regclass to one that is supported by the ISA
+    if (MO.isReg() && MO.isDef()) {
+      if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+        const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg());
+        const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass);
+
+        assert(newRegClass);
+
+        MRI.setRegClass(MO.getReg(), newRegClass);
+      }
+    }
+  }
+}
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
new file mode 100644 (file)
index 0000000..b5d294b
--- /dev/null
@@ -0,0 +1,46 @@
+//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definition of a TargetInstrInfo class that is common
+// to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUINSTRUCTIONINFO_H_
+#define AMDGPUINSTRUCTIONINFO_H_
+
+#include "AMDGPURegisterInfo.h"
+#include "AMDILInstrInfo.h"
+
+#include <map>
+
+namespace llvm {
+
+class AMDGPUTargetMachine;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
+
+class AMDGPUInstrInfo : public AMDILInstrInfo {
+
+public:
+  explicit AMDGPUInstrInfo(AMDGPUTargetMachine &tm);
+
+  virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
+
+  /// convertToISA - Convert the AMDIL MachineInstr to a supported ISA
+  /// MachineInstr
+  virtual void convertToISA(MachineInstr & MI, MachineFunction &MF,
+    DebugLoc DL) const;
+
+};
+
+} // End llvm namespace
+
+#endif // AMDGPUINSTRUCTIONINFO_H_
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
new file mode 100644 (file)
index 0000000..4452719
--- /dev/null
@@ -0,0 +1,69 @@
+//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains DAG node definitions for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AMDGPU DAG Profiles
+//===----------------------------------------------------------------------===//
+
+def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
+]>;
+
+//===----------------------------------------------------------------------===//
+// AMDGPU DAG Nodes
+//===----------------------------------------------------------------------===//
+
+// out = ((a << 32) | b) >> c
+//
+// Can be used to optimize rotl:
+// rotl(a, b) = bitalign(a, a, 32 - b)
+def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>;
+
+// out = a - floor(a)
+def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
+
+// out = max(a, b) a and b are floats
+def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = max(a, b) a and b are signed ints
+def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = max(a, b) a and b are unsigned ints
+def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a and b are floats
+def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a and b are signed ints
+def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a and b are unsigned ints
+def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// urecip - This operation is a helper for integer division; it returns the
+// result of 1 / a as a fractional unsigned integer.
+// out = (2^32 / a) + e
+// where e is the rounding error
+def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
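(A quick host-side check of the BITALIGN identity documented above; a sketch,
not part of this patch. Masking the shift amount with "& 31" is an assumption
about how the hardware treats a shift of 32.)

#include <cassert>
#include <cstdint>

// out = ((a << 32) | b) >> c, keeping the low 32 bits.
uint32_t bitalign(uint32_t a, uint32_t b, uint32_t c) {
  return (uint32_t)((((uint64_t)a << 32) | b) >> (c & 31));
}

uint32_t rotl32(uint32_t a, uint32_t b) {
  b &= 31;
  return b ? (a << b) | (a >> (32 - b)) : a;
}

int main() {
  for (uint32_t b = 0; b < 32; ++b)
    assert(rotl32(0xDEADBEEFu, b) == bitalign(0xDEADBEEFu, 0xDEADBEEFu, 32 - b));
  return 0;
}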
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
new file mode 100644 (file)
index 0000000..81b58c1
--- /dev/null
@@ -0,0 +1,123 @@
+//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction defs that are common to all hw codegen
+// targets.
+//
+//===----------------------------------------------------------------------===//
+
+class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
+  field bits<16> AMDILOp = 0;
+  field bits<3> Gen = 0;
+
+  let Namespace = "AMDGPU";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let Itinerary = NullALU;
+  let TSFlags{42-40} = Gen;
+  let TSFlags{63-48} = AMDILOp;
+}
+
+class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
+    : AMDGPUInst<outs, ins, asm, pattern> {
+
+  field bits<32> Inst = 0xffffffff;
+
+}
+
+class Constants {
+  int TWO_PI = 0x40c90fdb;
+  int PI = 0x40490fdb;
+  int TWO_PI_INV = 0x3e22f983;
+}
+def CONST : Constants;
+
+def FP_ZERO : PatLeaf <
+  (fpimm),
+  [{return N->getValueAPF().isZero();}]
+>;
+
+def FP_ONE : PatLeaf <
+  (fpimm),
+  [{return N->isExactlyValue(1.0);}]
+>;
+
+let isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1  in {
+
+class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "CLAMP $dst, $src0",
+  [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+>;
+
+class FABS <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "FABS $dst, $src0",
+  [(set rc:$dst, (fabs rc:$src0))]
+>;
+
+class FNEG <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "FNEG $dst, $src0",
+  [(set rc:$dst, (fneg rc:$src0))]
+>;
+
+} // End isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1
+
+/* Generic helper patterns for intrinsics */
+/* -------------------------------------- */
+
+class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul,
+                  RegisterClass rc> : Pat <
+  (int_AMDGPU_pow rc:$src0, rc:$src1),
+  (exp_ieee (mul rc:$src1, (log_ieee rc:$src0)))
+>;
+
+/* Other helper patterns */
+/* --------------------- */
+
+/* Extract element pattern */
+class Extract_Element <ValueType sub_type, ValueType vec_type,
+                     RegisterClass vec_class, int sub_idx, 
+                     SubRegIndex sub_reg>: Pat<
+  (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)),
+  (EXTRACT_SUBREG vec_class:$src, sub_reg)
+>;
+
+/* Insert element pattern */
+class Insert_Element <ValueType elem_type, ValueType vec_type,
+                      RegisterClass elem_class, RegisterClass vec_class,
+                      int sub_idx, SubRegIndex sub_reg> : Pat <
+
+  (vec_type (vector_insert (vec_type vec_class:$vec),
+                           (elem_type elem_class:$elem), sub_idx)),
+  (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg)
+>;
+
+// Vector Build pattern
+class Vector_Build <ValueType vecType, RegisterClass elemClass> : Pat <
+  (IL_vbuild elemClass:$src),
+  (INSERT_SUBREG (vecType (IMPLICIT_DEF)), elemClass:$src, sel_x)
+>;
+
+// bitconvert pattern
+class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
+  (dt (bitconvert (st rc:$src0))),
+  (dt rc:$src0)
+>;
+
+include "R600Instructions.td"
+
+include "SIInstrInfo.td"
+
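(The Constants values in AMDGPUInstructions.td above are the IEEE-754
single-precision bit patterns of 2*pi, pi, and 1/(2*pi), stored as ints;
presumably instruction patterns elsewhere in the backend materialize them as
immediates. A host-side decode for reference; not part of this patch.)

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

float bits_to_float(uint32_t bits) {
  float f;
  std::memcpy(&f, &bits, sizeof f);  // well-defined bit cast
  return f;
}

int main() {
  assert(std::fabs(bits_to_float(0x40c90fdb) - 6.2831853f) < 1e-6f);   // TWO_PI
  assert(std::fabs(bits_to_float(0x40490fdb) - 3.1415927f) < 1e-6f);   // PI
  assert(std::fabs(bits_to_float(0x3e22f983) - 0.15915494f) < 1e-7f);  // TWO_PI_INV
  return 0;
}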
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
new file mode 100644 (file)
index 0000000..78f072c
--- /dev/null
@@ -0,0 +1,64 @@
+//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines intrinsics that are used by all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "AMDGPU", isTarget = 1 in {
+
+  def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_cos : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+  def int_AMDGPU_floor : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
+  def int_AMDGPU_kilp : Intrinsic<[], [], []>;
+  def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sin : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_ssg : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+}
+
+let TargetPrefix = "TGSI", isTarget = 1 in {
+
+  def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[]>;
+}
+
+include "SIIntrinsics.td"
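(For reference, int_AMDGPU_lrp above is plain linear interpolation;
LowerIntrinsicLRP in AMDGPUISelLowering.cpp expands it as a*b + (1 - a)*c.
A one-line host model, illustrative only:)

#include <cassert>

// lrp(a, b, c) blends from c (at a == 0) to b (at a == 1).
float lrp(float a, float b, float c) { return a * b + (1.0f - a) * c; }

int main() {
  assert(lrp(0.25f, 10.0f, 20.0f) == 17.5f);  // operands exactly representable
  return 0;
}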
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
new file mode 100644 (file)
index 0000000..ad48335
--- /dev/null
@@ -0,0 +1,24 @@
+//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Parent TargetRegisterInfo class common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+
+using namespace llvm;
+
+AMDGPURegisterInfo::AMDGPURegisterInfo(AMDGPUTargetMachine &tm,
+    const TargetInstrInfo &tii)
+  : AMDILRegisterInfo(tm, tii),
+    TM(tm),
+    TII(tii) { }
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
new file mode 100644 (file)
index 0000000..5863807
--- /dev/null
@@ -0,0 +1,42 @@
+//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the TargetRegisterInfo interface that is implemented
+// by all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUREGISTERINFO_H_
+#define AMDGPUREGISTERINFO_H_
+
+#include "AMDILRegisterInfo.h"
+
+namespace llvm {
+
+class AMDGPUTargetMachine;
+class TargetInstrInfo;
+
+struct AMDGPURegisterInfo : public AMDILRegisterInfo
+{
+  AMDGPUTargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+  AMDGPURegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+
+  virtual BitVector getReservedRegs(const MachineFunction &MF) const = 0;
+
+  /// getISARegClass - rc is an AMDIL reg class.  This function returns the
+  /// ISA reg class that is equivalent to the given AMDIL reg class.
+  virtual const TargetRegisterClass *
+    getISARegClass(const TargetRegisterClass * rc) const = 0;
+};
+
+} // End namespace llvm
+
+#endif // AMDGPUREGISTERINFO_H_
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td
new file mode 100644 (file)
index 0000000..8181e02
--- /dev/null
@@ -0,0 +1,22 @@
+//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tablegen register definitions common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "AMDGPU" in {
+  def sel_x : SubRegIndex;
+  def sel_y : SubRegIndex;
+  def sel_z : SubRegIndex;
+  def sel_w : SubRegIndex;
+}
+
+include "R600RegisterInfo.td"
+include "SIRegisterInfo.td"
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
new file mode 100644 (file)
index 0000000..96ace88
--- /dev/null
@@ -0,0 +1,36 @@
+//===-- AMDGPUSubtarget.h - Define Subtarget for the AMDGPU -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file declares the AMDGPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUSUBTARGET_H_
+#define AMDGPUSUBTARGET_H_
+#include "AMDILSubtarget.h"
+
+namespace llvm {
+
+class AMDGPUSubtarget : public AMDILSubtarget
+{
+  InstrItineraryData InstrItins;
+
+public:
+  AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
+    AMDILSubtarget(TT, CPU, FS)
+  {
+    InstrItins = getInstrItineraryForCPU(CPU);
+  }
+
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+};
+
+} // End namespace llvm
+
+#endif // AMDGPUSUBTARGET_H_
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
new file mode 100644 (file)
index 0000000..a581aca
--- /dev/null
@@ -0,0 +1,162 @@
+//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The AMDGPU target machine contains all of the hardware specific information
+// needed to emit code for R600 and SI GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPU.h"
+#include "R600ISelLowering.h"
+#include "R600InstrInfo.h"
+#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeAMDGPUTarget() {
+  // Register the target
+  RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
+}
+
+AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
+    StringRef CPU, StringRef FS, TargetOptions Options, Reloc::Model RM,
+    CodeModel::Model CM, CodeGenOpt::Level OptLevel)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
+    Subtarget(TT, CPU, FS),
+    DataLayout(Subtarget.getDataLayout()),
+    FrameLowering(TargetFrameLowering::StackGrowsUp,
+                  Subtarget.device()->getStackAlignment(), 0),
+    IntrinsicInfo(this),
+    InstrItins(&Subtarget.getInstrItineraryData()),
+    mDump(false)
+{
+  // TLInfo uses InstrInfo, so it must be initialized after InstrInfo.
+  if (Subtarget.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+    InstrInfo = new R600InstrInfo(*this);
+    TLInfo = new R600TargetLowering(*this);
+  } else {
+    InstrInfo = new SIInstrInfo(*this);
+    TLInfo = new SITargetLowering(*this);
+  }
+}
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine()
+{
+}
+
+bool AMDGPUTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
+                                              formatted_raw_ostream &Out,
+                                              CodeGenFileType FileType,
+                                              bool DisableVerify,
+                                              AnalysisID StartAfter,
+                                              AnalysisID StopAfter) {
+  // XXX: Hack: the LLVMTargetMachine::addPassesToEmitFile() call below will
+  // fail, but this is OK since we are only using it to access
+  // addPassesToGenerateCode().
+  bool fail = LLVMTargetMachine::addPassesToEmitFile(PM, Out, FileType,
+                                                     DisableVerify);
+  assert(fail);
+
+  const AMDILSubtarget &STM = getSubtarget<AMDILSubtarget>();
+  std::string gpu = STM.getDeviceName();
+  if (gpu == "SI") {
+    PM.add(createSICodeEmitterPass(Out));
+  } else if (Subtarget.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+    PM.add(createR600CodeEmitterPass(Out));
+  } else {
+    abort();
+    return true;
+  }
+  PM.add(createGCInfoDeleter());
+
+  return false;
+}
+
+namespace {
+class AMDGPUPassConfig : public TargetPassConfig {
+public:
+  AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
+    return getTM<AMDGPUTargetMachine>();
+  }
+
+  virtual bool addPreISel();
+  virtual bool addInstSelector();
+  virtual bool addPreRegAlloc();
+  virtual bool addPostRegAlloc();
+  virtual bool addPreSched2();
+  virtual bool addPreEmitPass();
+};
+} // End of anonymous namespace
+
+TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new AMDGPUPassConfig(this, PM);
+}
+
+bool
+AMDGPUPassConfig::addPreISel()
+{
+  const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
+  if (ST.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+    addPass(createR600KernelParametersPass(
+                     getAMDGPUTargetMachine().getTargetData()));
+  }
+  return false;
+}
+
+bool AMDGPUPassConfig::addInstSelector() {
+  addPass(createAMDILPeepholeOpt(*TM));
+  addPass(createAMDILISelDag(getAMDGPUTargetMachine()));
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreRegAlloc() {
+  const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
+
+  if (ST.device()->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+    addPass(createSIAssignInterpRegsPass(*TM));
+  }
+  addPass(createAMDGPUConvertToISAPass(*TM));
+  return false;
+}
+
+bool AMDGPUPassConfig::addPostRegAlloc() {
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreSched2() {
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreEmitPass() {
+  addPass(createAMDILCFGPreparationPass(*TM));
+  addPass(createAMDILCFGStructurizerPass(*TM));
+
+  return false;
+}
+
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
new file mode 100644 (file)
index 0000000..c704864
--- /dev/null
@@ -0,0 +1,76 @@
+//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  The AMDGPU TargetMachine interface definition for hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_TARGET_MACHINE_H
+#define AMDGPU_TARGET_MACHINE_H
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDILFrameLowering.h"
+#include "AMDILIntrinsicInfo.h"
+#include "R600ISelLowering.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+
+MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT);
+
+class AMDGPUTargetMachine : public LLVMTargetMachine {
+
+  AMDGPUSubtarget Subtarget;
+  const TargetData DataLayout;
+  AMDILFrameLowering FrameLowering;
+  AMDILIntrinsicInfo IntrinsicInfo;
+  const AMDGPUInstrInfo * InstrInfo;
+  AMDGPUTargetLowering * TLInfo;
+  const InstrItineraryData* InstrItins;
+  bool mDump;
+
+public:
+   AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS,
+                       TargetOptions Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+   ~AMDGPUTargetMachine();
+   virtual const AMDILFrameLowering* getFrameLowering() const {
+     return &FrameLowering;
+   }
+   virtual const AMDILIntrinsicInfo* getIntrinsicInfo() const {
+     return &IntrinsicInfo;
+   }
+   virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;}
+   virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; }
+   virtual const AMDGPURegisterInfo *getRegisterInfo() const {
+      return &InstrInfo->getRegisterInfo();
+   }
+   virtual AMDGPUTargetLowering * getTargetLowering() const {
+      return TLInfo;
+   }
+   virtual const InstrItineraryData* getInstrItineraryData() const {
+      return InstrItins;
+   }
+   virtual const TargetData* getTargetData() const { return &DataLayout; }
+   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+   virtual bool addPassesToEmitFile(PassManagerBase &PM,
+                                              formatted_raw_ostream &Out,
+                                              CodeGenFileType FileType,
+                                              bool DisableVerify,
+                                              AnalysisID StartAfter = 0,
+                                              AnalysisID StopAfter = 0);
+};
+
+} // End namespace llvm
+
+#endif // AMDGPU_TARGET_MACHINE_H
diff --git a/lib/Target/AMDGPU/AMDGPUUtil.cpp b/lib/Target/AMDGPU/AMDGPUUtil.cpp
new file mode 100644 (file)
index 0000000..63b359f
--- /dev/null
@@ -0,0 +1,139 @@
+//===-- AMDGPUUtil.cpp - AMDGPU Utility functions -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common utility functions used by hw codegen targets
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUUtil.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDIL.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+// Some instructions act as placeholders to emulate operations that the GPU
+// hardware does automatically. This function can be used to check if
+// an opcode falls into this category.
+bool AMDGPU::isPlaceHolderOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  default: return false;
+  case AMDGPU::RETURN:
+  case AMDGPU::LOAD_INPUT:
+  case AMDGPU::LAST:
+  case AMDGPU::MASK_WRITE:
+  case AMDGPU::RESERVE_REG:
+    return true;
+  }
+}
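+
+// Usage sketch (illustrative only): since the hardware performs these
+// operations implicitly, an emitter can simply skip placeholder opcodes:
+//
+//   if (AMDGPU::isPlaceHolderOpcode(MI->getOpcode()))
+//     continue;  // Nothing to encode for this instruction.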
+
+bool AMDGPU::isTransOp(unsigned opcode)
+{
+  switch(opcode) {
+    default: return false;
+
+    case AMDGPU::COS_r600:
+    case AMDGPU::COS_eg:
+    case AMDGPU::MULLIT:
+    case AMDGPU::MUL_LIT_r600:
+    case AMDGPU::MUL_LIT_eg:
+    case AMDGPU::EXP_IEEE_r600:
+    case AMDGPU::EXP_IEEE_eg:
+    case AMDGPU::LOG_CLAMPED_r600:
+    case AMDGPU::LOG_IEEE_r600:
+    case AMDGPU::LOG_CLAMPED_eg:
+    case AMDGPU::LOG_IEEE_eg:
+      return true;
+  }
+}
+
+bool AMDGPU::isTexOp(unsigned opcode)
+{
+  switch(opcode) {
+  default: return false;
+  case AMDGPU::TEX_LD:
+  case AMDGPU::TEX_GET_TEXTURE_RESINFO:
+  case AMDGPU::TEX_SAMPLE:
+  case AMDGPU::TEX_SAMPLE_C:
+  case AMDGPU::TEX_SAMPLE_L:
+  case AMDGPU::TEX_SAMPLE_C_L:
+  case AMDGPU::TEX_SAMPLE_LB:
+  case AMDGPU::TEX_SAMPLE_C_LB:
+  case AMDGPU::TEX_SAMPLE_G:
+  case AMDGPU::TEX_SAMPLE_C_G:
+  case AMDGPU::TEX_GET_GRADIENTS_H:
+  case AMDGPU::TEX_GET_GRADIENTS_V:
+  case AMDGPU::TEX_SET_GRADIENTS_H:
+  case AMDGPU::TEX_SET_GRADIENTS_V:
+    return true;
+  }
+}
+
+bool AMDGPU::isReductionOp(unsigned opcode)
+{
+  switch(opcode) {
+    default: return false;
+    case AMDGPU::DOT4_r600:
+    case AMDGPU::DOT4_eg:
+      return true;
+  }
+}
+
+bool AMDGPU::isCubeOp(unsigned opcode)
+{
+  switch(opcode) {
+    default: return false;
+    case AMDGPU::CUBE_r600:
+    case AMDGPU::CUBE_eg:
+      return true;
+  }
+}
+
+bool AMDGPU::isFCOp(unsigned opcode)
+{
+  switch(opcode) {
+  default: return false;
+  case AMDGPU::BREAK_LOGICALZ_f32:
+  case AMDGPU::BREAK_LOGICALNZ_i32:
+  case AMDGPU::BREAK_LOGICALZ_i32:
+  case AMDGPU::BREAK_LOGICALNZ_f32:
+  case AMDGPU::CONTINUE_LOGICALNZ_f32:
+  case AMDGPU::IF_LOGICALNZ_i32:
+  case AMDGPU::IF_LOGICALZ_f32:
+  case AMDGPU::ELSE:
+  case AMDGPU::ENDIF:
+  case AMDGPU::ENDLOOP:
+  case AMDGPU::IF_LOGICALNZ_f32:
+  case AMDGPU::WHILELOOP:
+    return true;
+  }
+}
+
+void AMDGPU::utilAddLiveIn(MachineFunction * MF,
+                           MachineRegisterInfo & MRI,
+                           const TargetInstrInfo * TII,
+                           unsigned physReg, unsigned virtReg)
+{
+    if (!MRI.isLiveIn(physReg)) {
+      MRI.addLiveIn(physReg, virtReg);
+      MF->front().addLiveIn(physReg);
+      BuildMI(MF->front(), MF->front().begin(), DebugLoc(),
+              TII->get(TargetOpcode::COPY), virtReg)
+                .addReg(physReg);
+    } else {
+      MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg));
+    }
+}
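+
+// Usage sketch (illustrative; RC and PhysReg below are hypothetical): a
+// lowering routine can bind a physical input register to a fresh virtual
+// register before referencing it:
+//
+//   unsigned VirtReg = MRI.createVirtualRegister(RC);
+//   AMDGPU::utilAddLiveIn(MF, MRI, TII, PhysReg, VirtReg);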
diff --git a/lib/Target/AMDGPU/AMDGPUUtil.h b/lib/Target/AMDGPU/AMDGPUUtil.h
new file mode 100644 (file)
index 0000000..e8b02b1
--- /dev/null
@@ -0,0 +1,46 @@
+//===-- AMDGPUUtil.h - AMDGPU Utility function declarations -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations for utility functions common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_UTIL_H
+#define AMDGPU_UTIL_H
+
+namespace llvm {
+
+class MachineFunction;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+
+namespace AMDGPU {
+
+bool isPlaceHolderOpcode(unsigned opcode);
+
+bool isTransOp(unsigned opcode);
+bool isTexOp(unsigned opcode);
+bool isReductionOp(unsigned opcode);
+bool isCubeOp(unsigned opcode);
+bool isFCOp(unsigned opcode);
+
+// XXX: Move these to AMDGPUInstrInfo.h
+#define MO_FLAG_CLAMP (1 << 0)
+#define MO_FLAG_NEG   (1 << 1)
+#define MO_FLAG_ABS   (1 << 2)
+#define MO_FLAG_MASK  (1 << 3)
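+
+// Usage sketch (illustrative): these look like per-operand modifier bits; a
+// hypothetical use would OR them into a machine operand's target flags:
+//
+//   MO.setTargetFlags(MO.getTargetFlags() | MO_FLAG_NEG | MO_FLAG_ABS);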
+
+void utilAddLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI,
+    const TargetInstrInfo * TII, unsigned physReg, unsigned virtReg);
+
+} // End namespace AMDGPU
+
+} // End namespace llvm
+
+#endif // AMDGPU_UTIL_H
diff --git a/lib/Target/AMDGPU/AMDIL.h b/lib/Target/AMDGPU/AMDIL.h
new file mode 100644 (file)
index 0000000..4029f27
--- /dev/null
@@ -0,0 +1,251 @@
+//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// AMDIL back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDIL_H_
+#define AMDIL_H_
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define AMDIL_MAJOR_VERSION 2
+#define AMDIL_MINOR_VERSION 0
+#define AMDIL_REVISION_NUMBER 74
+#define ARENA_SEGMENT_RESERVED_UAVS 12
+#define DEFAULT_ARENA_UAV_ID 8
+#define DEFAULT_RAW_UAV_ID 7
+#define GLOBAL_RETURN_RAW_UAV_ID 11
+#define HW_MAX_NUM_CB 8
+#define MAX_NUM_UNIQUE_UAVS 8
+#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8
+#define OPENCL_MAX_READ_IMAGES 128
+#define OPENCL_MAX_WRITE_IMAGES 8
+#define OPENCL_MAX_SAMPLERS 16
+
+// The following values can never be zero, as zero is the ID that is
+// used to assert against.
+#define DEFAULT_LDS_ID     1
+#define DEFAULT_GDS_ID     1
+#define DEFAULT_SCRATCH_ID 1
+#define DEFAULT_VEC_SLOTS  8
+
+// SC->CAL version matchings.
+#define CAL_VERSION_SC_150               1700
+#define CAL_VERSION_SC_149               1700
+#define CAL_VERSION_SC_148               1525
+#define CAL_VERSION_SC_147               1525
+#define CAL_VERSION_SC_146               1525
+#define CAL_VERSION_SC_145               1451
+#define CAL_VERSION_SC_144               1451
+#define CAL_VERSION_SC_143               1441
+#define CAL_VERSION_SC_142               1441
+#define CAL_VERSION_SC_141               1420
+#define CAL_VERSION_SC_140               1400
+#define CAL_VERSION_SC_139               1387
+#define CAL_VERSION_SC_138               1387
+#define CAL_APPEND_BUFFER_SUPPORT        1340
+#define CAL_VERSION_SC_137               1331
+#define CAL_VERSION_SC_136                982
+#define CAL_VERSION_SC_135                950
+#define CAL_VERSION_GLOBAL_RETURN_BUFFER  990
+
+#define OCL_DEVICE_RV710        0x0001
+#define OCL_DEVICE_RV730        0x0002
+#define OCL_DEVICE_RV770        0x0004
+#define OCL_DEVICE_CEDAR        0x0008
+#define OCL_DEVICE_REDWOOD      0x0010
+#define OCL_DEVICE_JUNIPER      0x0020
+#define OCL_DEVICE_CYPRESS      0x0040
+#define OCL_DEVICE_CAICOS       0x0080
+#define OCL_DEVICE_TURKS        0x0100
+#define OCL_DEVICE_BARTS        0x0200
+#define OCL_DEVICE_CAYMAN       0x0400
+#define OCL_DEVICE_ALL          0x3FFF
+
+/// The number of function IDs that are reserved for
+/// internal compiler usage.
+const unsigned int RESERVED_FUNCS = 1024;
+
+#define AMDIL_OPT_LEVEL_DECL
+#define AMDIL_OPT_LEVEL_VAR
+#define AMDIL_OPT_LEVEL_VAR_NO_COMMA
+
+namespace llvm {
+class AMDILInstrPrinter;
+class FunctionPass;
+class MCAsmInfo;
+class raw_ostream;
+class Target;
+class TargetMachine;
+
+/// Instruction selection passes.
+FunctionPass*
+  createAMDILISelDag(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+FunctionPass*
+  createAMDILPeepholeOpt(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+
+/// Pre emit passes.
+FunctionPass*
+  createAMDILCFGPreparationPass(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+FunctionPass*
+  createAMDILCFGStructurizerPass(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+
+extern Target TheAMDILTarget;
+extern Target TheAMDGPUTarget;
+} // end namespace llvm
+
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+
+/// Include device information enumerations
+#include "AMDILDeviceInfo.h"
+
+namespace llvm {
+/// OpenCL uses address spaces to differentiate between
+/// various memory regions on the hardware. On the CPU
+/// all of the address spaces point to the same memory;
+/// on the GPU, however, each address space points to
+/// a separate piece of memory that is distinct from
+/// other memory locations.
+namespace AMDILAS {
+enum AddressSpaces {
+  PRIVATE_ADDRESS  = 0, // Address space for private memory.
+  GLOBAL_ADDRESS   = 1, // Address space for global memory (RAT0, VTX0).
+  CONSTANT_ADDRESS = 2, // Address space for constant memory.
+  LOCAL_ADDRESS    = 3, // Address space for local memory.
+  REGION_ADDRESS   = 4, // Address space for region memory.
+  ADDRESS_NONE     = 5, // Address space for unknown memory.
+  PARAM_D_ADDRESS  = 6, // Address space for directly addressable parameter memory (CONST0)
+  PARAM_I_ADDRESS  = 7, // Address space for indirectly addressable parameter memory (VTX1)
+  USER_SGPR_ADDRESS = 8, // Address space for USER_SGPRS on SI
+  LAST_ADDRESS     = 9
+};
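+
+// Usage sketch (illustrative): lowering code can dispatch on the address
+// space of a pointer type, e.g.:
+//
+//   if (PtrTy->getAddressSpace() == AMDILAS::LOCAL_ADDRESS) {
+//     // ... emit an access to local (LDS) memory ...
+//   }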
+
+// This union/struct combination is an easy way to read out the
+// exact bits that are needed.
+typedef union ResourceRec {
+  struct {
+#ifdef __BIG_ENDIAN__
+    unsigned short isImage       : 1;  // Reserved for future use/llvm.
+    unsigned short ResourceID    : 10; // Flag to specify the resource ID for
+                                       // the op.
+    unsigned short HardwareInst  : 1;  // Flag to specify that this instruction
+                                       // is a hardware instruction.
+    unsigned short ConflictPtr   : 1;  // Flag to specify that the pointer has a
+                                       // conflict.
+    unsigned short ByteStore     : 1;  // Flag to specify if the op is a byte
+                                       // store op.
+    unsigned short PointerPath   : 1;  // Flag to specify if the op is on the
+                                       // pointer path.
+    unsigned short CacheableRead : 1;  // Flag to specify if the read is
+                                       // cacheable.
+#else
+    unsigned short CacheableRead : 1;  // Flag to specify if the read is
+                                       // cacheable.
+    unsigned short PointerPath   : 1;  // Flag to specify if the op is on the
+                                       // pointer path.
+    unsigned short ByteStore     : 1;  // Flag to specify if the op is a byte
+                                       // store op.
+    unsigned short ConflictPtr   : 1;  // Flag to specify that the pointer has
+                                       // a conflict.
+    unsigned short HardwareInst  : 1;  // Flag to specify that this instruction
+                                       // is a hardware instruction.
+    unsigned short ResourceID    : 10; // Flag to specify the resource ID for
+                                       // the op.
+    unsigned short isImage       : 1;  // Reserved for future use.
+#endif
+  } bits;
+  unsigned short u16all;
+} InstrResEnc;
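+
+// Usage sketch (illustrative; the operand index N is hypothetical): the union
+// round-trips the packed encoding through a single 16-bit value:
+//
+//   InstrResEnc CurRes;
+//   CurRes.u16all = MI->getOperand(N).getImm();
+//   if (CurRes.bits.ByteStore) {
+//     // ... handle a byte store ...
+//   }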
+
+} // namespace AMDILAS
+
+// Enums corresponding to AMDIL condition codes for IL.  These
+// values must be kept in sync with the ones in the .td file.
+namespace AMDILCC {
+enum CondCodes {
+  // AMDIL specific condition codes. These correspond to the IL_CC_*
+  // in AMDILInstrInfo.td and must be kept in the same order.
+  IL_CC_D_EQ  =  0,   // DEQ instruction.
+  IL_CC_D_GE  =  1,   // DGE instruction.
+  IL_CC_D_LT  =  2,   // DLT instruction.
+  IL_CC_D_NE  =  3,   // DNE instruction.
+  IL_CC_F_EQ  =  4,   //  EQ instruction.
+  IL_CC_F_GE  =  5,   //  GE instruction.
+  IL_CC_F_LT  =  6,   //  LT instruction.
+  IL_CC_F_NE  =  7,   //  NE instruction.
+  IL_CC_I_EQ  =  8,   // IEQ instruction.
+  IL_CC_I_GE  =  9,   // IGE instruction.
+  IL_CC_I_LT  = 10,   // ILT instruction.
+  IL_CC_I_NE  = 11,   // INE instruction.
+  IL_CC_U_GE  = 12,   // UGE instruction.
+  IL_CC_U_LT  = 13,   // ULT instruction.
+  // Pseudo IL Comparison instructions here.
+  IL_CC_F_GT  = 14,   //  GT instruction.
+  IL_CC_U_GT  = 15,
+  IL_CC_I_GT  = 16,
+  IL_CC_D_GT  = 17,
+  IL_CC_F_LE  = 18,   //  LE instruction
+  IL_CC_U_LE  = 19,
+  IL_CC_I_LE  = 20,
+  IL_CC_D_LE  = 21,
+  IL_CC_F_UNE = 22,
+  IL_CC_F_UEQ = 23,
+  IL_CC_F_ULT = 24,
+  IL_CC_F_UGT = 25,
+  IL_CC_F_ULE = 26,
+  IL_CC_F_UGE = 27,
+  IL_CC_F_ONE = 28,
+  IL_CC_F_OEQ = 29,
+  IL_CC_F_OLT = 30,
+  IL_CC_F_OGT = 31,
+  IL_CC_F_OLE = 32,
+  IL_CC_F_OGE = 33,
+  IL_CC_D_UNE = 34,
+  IL_CC_D_UEQ = 35,
+  IL_CC_D_ULT = 36,
+  IL_CC_D_UGT = 37,
+  IL_CC_D_ULE = 38,
+  IL_CC_D_UGE = 39,
+  IL_CC_D_ONE = 40,
+  IL_CC_D_OEQ = 41,
+  IL_CC_D_OLT = 42,
+  IL_CC_D_OGT = 43,
+  IL_CC_D_OLE = 44,
+  IL_CC_D_OGE = 45,
+  IL_CC_U_EQ  = 46,
+  IL_CC_U_NE  = 47,
+  IL_CC_F_O   = 48,
+  IL_CC_D_O   = 49,
+  IL_CC_F_UO  = 50,
+  IL_CC_D_UO  = 51,
+  IL_CC_L_LE  = 52,
+  IL_CC_L_GE  = 53,
+  IL_CC_L_EQ  = 54,
+  IL_CC_L_NE  = 55,
+  IL_CC_L_LT  = 56,
+  IL_CC_L_GT  = 57,
+  IL_CC_UL_LE = 58,
+  IL_CC_UL_GE = 59,
+  IL_CC_UL_EQ = 60,
+  IL_CC_UL_NE = 61,
+  IL_CC_UL_LT = 62,
+  IL_CC_UL_GT = 63,
+  COND_ERROR  = 64
+};
+
+} // end namespace AMDILCC
+} // end namespace llvm
+#endif // AMDIL_H_
diff --git a/lib/Target/AMDGPU/AMDIL7XXDevice.cpp b/lib/Target/AMDGPU/AMDIL7XXDevice.cpp
new file mode 100644 (file)
index 0000000..3f2f821
--- /dev/null
@@ -0,0 +1,128 @@
+//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDIL7XXDevice.h"
+#include "AMDILDevice.h"
+
+using namespace llvm;
+
+AMDIL7XXDevice::AMDIL7XXDevice(AMDILSubtarget *ST) : AMDILDevice(ST)
+{
+  setCaps();
+  std::string name = mSTM->getDeviceName();
+  if (name == "rv710") {
+    mDeviceFlag = OCL_DEVICE_RV710;
+  } else if (name == "rv730") {
+    mDeviceFlag = OCL_DEVICE_RV730;
+  } else {
+    mDeviceFlag = OCL_DEVICE_RV770;
+  }
+}
+
+AMDIL7XXDevice::~AMDIL7XXDevice()
+{
+}
+
+void AMDIL7XXDevice::setCaps()
+{
+  mSWBits.set(AMDILDeviceInfo::LocalMem);
+}
+
+size_t AMDIL7XXDevice::getMaxLDSSize() const
+{
+  if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_700;
+  }
+  return 0;
+}
+
+size_t AMDIL7XXDevice::getWavefrontSize() const
+{
+  return AMDILDevice::HalfWavefrontSize;
+}
+
+uint32_t AMDIL7XXDevice::getGeneration() const
+{
+  return AMDILDeviceInfo::HD4XXX;
+}
+
+uint32_t AMDIL7XXDevice::getResourceID(uint32_t DeviceID) const
+{
+  switch (DeviceID) {
+  default:
+    assert(0 && "ID type passed in is unknown!");
+    break;
+  case GLOBAL_ID:
+  case CONSTANT_ID:
+  case RAW_UAV_ID:
+  case ARENA_UAV_ID:
+    break;
+  case LDS_ID:
+    if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+      return DEFAULT_LDS_ID;
+    }
+    break;
+  case SCRATCH_ID:
+    if (usesHardware(AMDILDeviceInfo::PrivateMem)) {
+      return DEFAULT_SCRATCH_ID;
+    }
+    break;
+  case GDS_ID:
+    assert(0 && "GDS UAV ID is not supported on this chip");
+    if (usesHardware(AMDILDeviceInfo::RegionMem)) {
+      return DEFAULT_GDS_ID;
+    }
+    break;
+  }
+
+  return 0;
+}
+
+uint32_t AMDIL7XXDevice::getMaxNumUAVs() const
+{
+  return 1;
+}
+
+AMDIL770Device::AMDIL770Device(AMDILSubtarget *ST): AMDIL7XXDevice(ST)
+{
+  setCaps();
+}
+
+AMDIL770Device::~AMDIL770Device()
+{
+}
+
+void AMDIL770Device::setCaps()
+{
+  if (mSTM->isOverride(AMDILDeviceInfo::DoubleOps)) {
+    mSWBits.set(AMDILDeviceInfo::FMA);
+    mHWBits.set(AMDILDeviceInfo::DoubleOps);
+  }
+  mSWBits.set(AMDILDeviceInfo::BarrierDetect);
+  mHWBits.reset(AMDILDeviceInfo::LongOps);
+  mSWBits.set(AMDILDeviceInfo::LongOps);
+  mSWBits.set(AMDILDeviceInfo::LocalMem);
+}
+
+size_t AMDIL770Device::getWavefrontSize() const
+{
+  return AMDILDevice::WavefrontSize;
+}
+
+AMDIL710Device::AMDIL710Device(AMDILSubtarget *ST) : AMDIL7XXDevice(ST)
+{
+}
+
+AMDIL710Device::~AMDIL710Device()
+{
+}
+
+size_t AMDIL710Device::getWavefrontSize() const
+{
+  return AMDILDevice::QuarterWavefrontSize;
+}
diff --git a/lib/Target/AMDGPU/AMDIL7XXDevice.h b/lib/Target/AMDGPU/AMDIL7XXDevice.h
new file mode 100644 (file)
index 0000000..4d8d47a
--- /dev/null
@@ -0,0 +1,71 @@
+//==-- AMDIL7XXDevice.h - Define 7XX Device for AMDIL ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===----------------------------------------------------------------------===//
+// This file will define the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
+// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef _AMDIL7XXDEVICEIMPL_H_
+#define _AMDIL7XXDEVICEIMPL_H_
+#include "AMDILDevice.h"
+#include "AMDILSubtarget.h"
+
+namespace llvm {
+class AMDILSubtarget;
+
+//===----------------------------------------------------------------------===//
+// 7XX generation of devices and their respective sub classes
+//===----------------------------------------------------------------------===//
+
+// The AMDIL7XXDevice class represents the generic 7XX device. All 7XX
+// devices are derived from this class. The AMDIL7XX device will only
+// support the minimal features that are required to be considered OpenCL 1.0
+// compliant and nothing more.
+class AMDIL7XXDevice : public AMDILDevice {
+public:
+  AMDIL7XXDevice(AMDILSubtarget *ST);
+  virtual ~AMDIL7XXDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual size_t getWavefrontSize() const;
+  virtual uint32_t getGeneration() const;
+  virtual uint32_t getResourceID(uint32_t DeviceID) const;
+  virtual uint32_t getMaxNumUAVs() const;
+
+protected:
+  virtual void setCaps();
+}; // AMDIL7XXDevice
+
+// The AMDIL770Device class represents the RV770 chip and its
+// derivative cards. The difference between this device and the base
+// class is that this device adds support for double precision
+// and has a larger wavefront size.
+class AMDIL770Device : public AMDIL7XXDevice {
+public:
+  AMDIL770Device(AMDILSubtarget *ST);
+  virtual ~AMDIL770Device();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+}; // AMDIL770Device
+
+// The AMDIL710Device class derives from the 7XX base class, but this
+// class is a smaller derivative, so we need to overload some of the
+// functions in order to correctly specify this information.
+class AMDIL710Device : public AMDIL7XXDevice {
+public:
+  AMDIL710Device(AMDILSubtarget *ST);
+  virtual ~AMDIL710Device();
+  virtual size_t getWavefrontSize() const;
+}; // AMDIL710Device
+
+} // namespace llvm
+#endif // _AMDIL7XXDEVICEIMPL_H_
diff --git a/lib/Target/AMDGPU/AMDILAlgorithms.tpp b/lib/Target/AMDGPU/AMDILAlgorithms.tpp
new file mode 100644 (file)
index 0000000..058475f
--- /dev/null
@@ -0,0 +1,93 @@
+//===------ AMDILAlgorithms.tpp - AMDIL Template Algorithms Header --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides template algorithms that extend the STL algorithms, but
+// are useful for the AMDIL backend.
+//
+//===----------------------------------------------------------------------===//
+
+// A template function that loops through the iterators and passes the second
+// argument along with each iterator to the function. This is based on the
+// for_each STL function, but allows a reference to the second argument. In
+// the 'safe' variant below, a return value of true from the function signals
+// that the current iterator was invalidated, so the loop steps back before
+// moving forward to the next iterator.
+template<class InputIterator, class Function, typename Arg>
+Function binaryForEach(InputIterator First, InputIterator Last, Function F,
+                       Arg &Second)
+{
+  for ( ; First!=Last; ++First ) {
+    F(*First, Second);
+  }
+  return F;
+}
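+
+// Usage sketch (illustrative, C++03-style functor):
+//
+//   struct CountInstrs {
+//     void operator()(MachineInstr &MI, unsigned &Count) { ++Count; }
+//   };
+//   unsigned Count = 0;
+//   binaryForEach(MBB->begin(), MBB->end(), CountInstrs(), Count);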
+
+template<class InputIterator, class Function, typename Arg>
+Function safeBinaryForEach(InputIterator First, InputIterator Last, Function F,
+                           Arg &Second)
+{
+  for ( ; First!=Last; ++First ) {
+    if (F(*First, Second)) {
+      --First;
+    }
+  }
+  return F;
+}
+
+// A template function that has two levels of looping before calling the
+// function with the passed-in argument. See binaryForEach for further
+// explanation.
+template<class InputIterator, class Function, typename Arg>
+Function binaryNestedForEach(InputIterator First, InputIterator Last,
+                             Function F, Arg &Second)
+{
+  for ( ; First != Last; ++First) {
+    binaryForEach(First->begin(), First->end(), F, Second);
+  }
+  return F;
+}
+template<class InputIterator, class Function, typename Arg>
+Function safeBinaryNestedForEach(InputIterator First, InputIterator Last,
+                                 Function F, Arg &Second)
+{
+  for ( ; First != Last; ++First) {
+    safeBinaryForEach(First->begin(), First->end(), F, Second);
+  }
+  return F;
+}
+
+// Unlike the STL, a pointer to the iterator itself is passed in with the
+// 'safe' versions of these functions. This allows the function to handle
+// situations such as invalidated iterators.
+template<class InputIterator, class Function>
+Function safeForEach(InputIterator First, InputIterator Last, Function F)
+{
+  for ( ; First != Last; ++First) {
+    F(&First);
+  }
+  return F;
+}
+
+// A template function that has two levels of looping before calling the
+// function with a pointer to the current iterator. See binaryForEach for
+// further explanation.
+template<class InputIterator, class SecondIterator, class Function>
+Function safeNestedForEach(InputIterator First, InputIterator Last,
+                              SecondIterator S, Function F)
+{
+  for ( ; First != Last; ++First) {
+    SecondIterator sf, sl;
+    for (sf = First->begin(), sl = First->end();
+         sf != sl; )  {
+      if (!F(&sf)) {
+        ++sf;
+      } 
+    }
+  }
+  return F;
+}
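+
+// Usage sketch (illustrative; isDead is a hypothetical predicate): a functor
+// for safeNestedForEach receives a pointer to the iterator so it can erase
+// the element itself, returning true when the iterator was already advanced:
+//
+//   struct RemoveDeadInstr {
+//     bool operator()(MachineBasicBlock::iterator *I) {
+//       if (isDead(**I)) {
+//         *I = (*I)->getParent()->erase(*I);
+//         return true;   // erase() already moved the iterator forward.
+//       }
+//       return false;    // Let the caller advance the iterator.
+//     }
+//   };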
diff --git a/lib/Target/AMDGPU/AMDILBase.td b/lib/Target/AMDGPU/AMDILBase.td
new file mode 100644 (file)
index 0000000..8a2d34a
--- /dev/null
@@ -0,0 +1,113 @@
+//===- AMDILBase.td - AMDIL Target Machine ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+// Dummy Instruction itineraries for pseudo instructions
+def ALU_NULL : FuncUnit;
+def NullALU : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// AMDIL Subtarget features.
+//===----------------------------------------------------------------------===//
+def FeatureFP64     : SubtargetFeature<"fp64",
+        "CapsOverride[AMDILDeviceInfo::DoubleOps]",
+        "true",
+        "Enable 64bit double precision operations">;
+def FeatureByteAddress    : SubtargetFeature<"byte_addressable_store",
+        "CapsOverride[AMDILDeviceInfo::ByteStores]",
+        "true",
+        "Enable byte addressable stores">;
+def FeatureBarrierDetect : SubtargetFeature<"barrier_detect",
+        "CapsOverride[AMDILDeviceInfo::BarrierDetect]",
+        "true",
+        "Enable duplicate barrier detection(HD5XXX or later).">;
+def FeatureImages : SubtargetFeature<"images",
+        "CapsOverride[AMDILDeviceInfo::Images]",
+        "true",
+        "Enable image functions">;
+def FeatureMultiUAV : SubtargetFeature<"multi_uav",
+        "CapsOverride[AMDILDeviceInfo::MultiUAV]",
+        "true",
+        "Generate multiple UAV code(HD5XXX family or later)">;
+def FeatureMacroDB : SubtargetFeature<"macrodb",
+        "CapsOverride[AMDILDeviceInfo::MacroDB]",
+        "true",
+        "Use internal macrodb, instead of macrodb in driver">;
+def FeatureNoAlias : SubtargetFeature<"noalias",
+        "CapsOverride[AMDILDeviceInfo::NoAlias]",
+        "true",
+        "assert that all kernel argument pointers are not aliased">;
+def FeatureNoInline : SubtargetFeature<"no-inline",
+        "CapsOverride[AMDILDeviceInfo::NoInline]",
+        "true",
+        "specify whether to not inline functions">;
+
+def Feature64BitPtr : SubtargetFeature<"64BitPtr",
+        "mIs64bit",
+        "false",
+        "Specify if 64bit addressing should be used.">;
+
+def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
+        "mIs32on64bit",
+        "false",
+        "Specify if 64bit sized pointers with 32bit addressing should be used.">;
+def FeatureDebug : SubtargetFeature<"debug",
+        "CapsOverride[AMDILDeviceInfo::Debug]",
+        "true",
+        "Debug mode is enabled, so disable hardware accelerated address spaces.">;
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+        "mDumpCode",
+        "true",
+        "Dump MachineInstrs in the CodeEmitter">;
+
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+
+include "AMDILRegisterInfo.td"
+include "AMDILCallingConv.td"
+include "AMDILInstrInfo.td"
+
+def AMDILInstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// AMDIL processors supported.
+//===----------------------------------------------------------------------===//
+//include "Processors.td"
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+def AMDILAsmWriter : AsmWriter {
+    string AsmWriterClassName = "AsmPrinter";
+    int Variant = 0;
+}
+
+def AMDILAsmParser : AsmParser {
+    string AsmParserClassName = "AsmParser";
+    int Variant = 0;
+
+    string CommentDelimiter = ";";
+
+    string RegisterPrefix = "r";
+
+}
+
+
+def AMDIL : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = AMDILInstrInfo;
+  let AssemblyWriters = [AMDILAsmWriter];
+  let AssemblyParsers = [AMDILAsmParser];
+}
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
new file mode 100644 (file)
index 0000000..1f1a6da
--- /dev/null
@@ -0,0 +1,3236 @@
+//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#define DEBUGME 0
+#define DEBUG_TYPE "structcfg"
+
+#include "AMDIL.h"
+#include "AMDILInstrInfo.h"
+#include "AMDILRegisterInfo.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define FirstNonDebugInstr(A) A->begin()
+using namespace llvm;
+
+// TODO: move-begin.
+
+//===----------------------------------------------------------------------===//
+//
+// Statistics for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+
+STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
+    "matched");
+STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
+    "matched");
+STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break "
+    "pattern matched");
+STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
+    "pattern matched");
+STATISTIC(numLoopPatternMatch,      "CFGStructurizer number of loop pattern "
+    "matched");
+STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
+STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
+
+//===----------------------------------------------------------------------===//
+//
+// Miscellaneous utility for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+namespace llvmCFGStruct
+{
+#define SHOWNEWINSTR(i) \
+  if (DEBUGME) errs() << "New instr: " << *i << "\n"
+
+#define SHOWNEWBLK(b, msg) \
+if (DEBUGME) { \
+  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  errs() << "\n"; \
+}
+
+#define SHOWBLK_DETAIL(b, msg) \
+if (DEBUGME) { \
+  if (b) { \
+  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  b->print(errs()); \
+  errs() << "\n"; \
+  } \
+}
+
+#define INVALIDSCCNUM -1
+#define INVALIDREGNUM 0
+
+template<class LoopinfoT>
+void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
+  for (typename LoopinfoT::iterator iter = LoopInfo.begin(),
+       iterEnd = LoopInfo.end();
+       iter != iterEnd; ++iter) {
+    (*iter)->print(OS, 0);
+  }
+}
+
+template<class NodeT>
+void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
+  size_t sz = Src.size();
+  for (size_t i = 0; i < sz/2; ++i) {
+    NodeT *t = Src[i];
+    Src[i] = Src[sz - i - 1];
+    Src[sz - i - 1] = t;
+  }
+}
+
+} //end namespace llvmCFGStruct
+
+
+//===----------------------------------------------------------------------===//
+//
+// MachinePostDominatorTree
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// MachinePostDominatorTree Class - Concrete subclass of DominatorTree that
+/// is used to compute a post-dominator tree.
+///
+struct MachinePostDominatorTree : public MachineFunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  DominatorTreeBase<MachineBasicBlock> *DT;
+  MachinePostDominatorTree() : MachineFunctionPass(ID)
+  {
+    // The 'true' argument selects post-dominator construction.
+    DT = new DominatorTreeBase<MachineBasicBlock>(true);
+  }
+
+  ~MachinePostDominatorTree();
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  inline const std::vector<MachineBasicBlock *> &getRoots() const {
+    return DT->getRoots();
+  }
+
+  inline MachineDomTreeNode *getRootNode() const {
+    return DT->getRootNode();
+  }
+
+  inline MachineDomTreeNode *operator[](MachineBasicBlock *BB) const {
+    return DT->getNode(BB);
+  }
+
+  inline MachineDomTreeNode *getNode(MachineBasicBlock *BB) const {
+    return DT->getNode(BB);
+  }
+
+  inline bool dominates(MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+    return DT->dominates(A, B);
+  }
+
+  inline bool dominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+    return DT->dominates(A, B);
+  }
+
+  inline bool
+  properlyDominates(const MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+    return DT->properlyDominates(A, B);
+  }
+
+  inline bool
+  properlyDominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+    return DT->properlyDominates(A, B);
+  }
+
+  inline MachineBasicBlock *
+  findNearestCommonDominator(MachineBasicBlock *A, MachineBasicBlock *B) {
+    return DT->findNearestCommonDominator(A, B);
+  }
+
+  virtual void print(llvm::raw_ostream &OS, const Module *M = 0) const {
+    DT->print(OS);
+  }
+};
+} //end of namespace llvm
+
+char MachinePostDominatorTree::ID = 0;
+static RegisterPass<MachinePostDominatorTree>
+machinePostDominatorTreePass("machinepostdomtree",
+                             "MachinePostDominator Tree Construction",
+                             true, true);
+
+//const PassInfo *const llvm::MachinePostDominatorsID
+//= &machinePostDominatorTreePass;
+
+bool MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
+  DT->recalculate(F);
+  //DEBUG(DT->dump());
+  return false;
+}
+
+MachinePostDominatorTree::~MachinePostDominatorTree() {
+  delete DT;
+}
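+
+// Usage sketch (illustrative): a pass that declared this analysis as required
+// can query post-dominance directly:
+//
+//   MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
+//   if (PDT->dominates(BlkA, BlkB)) {
+//     // BlkA post-dominates BlkB.
+//   }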
+
+//===----------------------------------------------------------------------===//
+//
+// supporting data structure for CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct
+{
+template<class PassT>
+struct CFGStructTraits {
+};
+
+template <class InstrT>
+class BlockInformation {
+public:
+  bool isRetired;
+  int  sccNum;
+  //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr;
+  //Instructions defining the corresponding successor.
+  BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {}
+};
+
+template <class BlockT, class InstrT, class RegiT>
+class LandInformation {
+public:
+  BlockT *landBlk;
+  std::set<RegiT> breakInitRegs;  //Registers that need "reg = 0" emitted
+                                  //before WHILELOOP(thisloop), i.e. before
+                                  //entering thisloop.
+  std::set<RegiT> contInitRegs;   //Registers that need "reg = 0" emitted
+                                  //after WHILELOOP(thisloop), i.e. after
+                                  //entering thisloop.
+  std::set<RegiT> endbranchInitRegs; //Registers initialized before entering
+                                     //this loop, at the loop land block; the
+                                     //branch condition is on these registers.
+  std::set<RegiT> breakOnRegs;      //Registers that need "if (reg) break
+                                    //endif" emitted after ENDLOOP(thisloop)
+                                    //to break out of outerLoopOf(thisLoop).
+  std::set<RegiT> contOnRegs;       //Registers that need "if (reg) continue
+                                    //endif" emitted after ENDLOOP(thisloop)
+                                    //to continue on outerLoopOf(thisLoop).
+  LandInformation() : landBlk(NULL) {}
+};
+
+} //end of namespace llvmCFGStruct
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct
+{
+// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
+template<class PassT>
+class  CFGStructurizer
+{
+public:
+  typedef enum {
+    Not_SinglePath = 0,
+    SinglePath_InPath = 1,
+    SinglePath_NotInPath = 2
+  } PathToKind;
+
+public:
+  typedef typename PassT::InstructionType         InstrT;
+  typedef typename PassT::FunctionType            FuncT;
+  typedef typename PassT::DominatortreeType       DomTreeT;
+  typedef typename PassT::PostDominatortreeType   PostDomTreeT;
+  typedef typename PassT::DomTreeNodeType         DomTreeNodeT;
+  typedef typename PassT::LoopinfoType            LoopInfoT;
+
+  typedef GraphTraits<FuncT *>                    FuncGTraits;
+  //typedef FuncGTraits::nodes_iterator BlockIterator;
+  typedef typename FuncT::iterator                BlockIterator;
+
+  typedef typename FuncGTraits::NodeType          BlockT;
+  typedef GraphTraits<BlockT *>                   BlockGTraits;
+  typedef GraphTraits<Inverse<BlockT *> >         InvBlockGTraits;
+  //typedef BlockGTraits::succ_iterator InstructionIterator;
+  typedef typename BlockT::iterator               InstrIterator;
+
+  typedef CFGStructTraits<PassT>                  CFGTraits;
+  typedef BlockInformation<InstrT>                BlockInfo;
+  typedef std::map<BlockT *, BlockInfo *>         BlockInfoMap;
+
+  typedef int                                     RegiT;
+  typedef typename PassT::LoopType                LoopT;
+  typedef LandInformation<BlockT, InstrT, RegiT>  LoopLandInfo;
+  typedef std::map<LoopT *, LoopLandInfo *>      LoopLandInfoMap;
+                                                 //landing info for loop break
+  typedef SmallVector<BlockT *, 32>               BlockTSmallerVector;
+
+public:
+  CFGStructurizer();
+  ~CFGStructurizer();
+
+  /// Perform the CFG structurization
+  bool run(FuncT &Func, PassT &Pass, const AMDILRegisterInfo *tri);
+
+  /// Perform the CFG preparation
+  bool prepare(FuncT &Func, PassT &Pass, const AMDILRegisterInfo *tri);
+
+private:
+  void   orderBlocks();
+  void   printOrderedBlocks(llvm::raw_ostream &OS);
+  int patternMatch(BlockT *CurBlock);
+  int patternMatchGroup(BlockT *CurBlock);
+
+  int serialPatternMatch(BlockT *CurBlock);
+  int ifPatternMatch(BlockT *CurBlock);
+  int switchPatternMatch(BlockT *CurBlock);
+  int loopendPatternMatch(BlockT *CurBlock);
+  int loopPatternMatch(BlockT *CurBlock);
+
+  int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
+  int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
+  //int loopWithoutBreak(BlockT *);
+
+  void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop,
+                        BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock);
+  void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop,
+                           BlockT *ContBlock, LoopT *contLoop);
+  bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block);
+  int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                       BlockT *FalseBlock);
+  int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock,
+                          BlockT *FalseBlock);
+  int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                              BlockT *FalseBlock, BlockT **LandBlockPtr);
+  void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                                   BlockT *FalseBlock, BlockT *LandBlock,
+                                   bool Detail = false);
+  PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock,
+                          bool AllowSideEntry = true);
+  BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock,
+                        bool AllowSideEntry = true);
+  int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock);
+  void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock);
+
+  void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock,
+                            BlockT *TrueBlock, BlockT *FalseBlock,
+                            BlockT *LandBlock);
+  void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand);
+  void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock,
+                           BlockT *ExitLandBlock, RegiT SetReg);
+  void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock,
+                           RegiT SetReg);
+  BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep,
+                                std::set<BlockT*> &ExitBlockSet,
+                                BlockT *ExitLandBlk);
+  BlockT *addLoopEndbranchBlock(LoopT *LoopRep,
+                                BlockTSmallerVector &ExitingBlocks,
+                                BlockTSmallerVector &ExitBlocks);
+  BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep);
+  void removeUnconditionalBranch(BlockT *SrcBlock);
+  void removeRedundantConditionalBranch(BlockT *SrcBlock);
+  void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks);
+
+  void removeSuccessor(BlockT *SrcBlock);
+  BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock);
+  BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock);
+
+  void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock,
+                          InstrIterator InsertPos);
+
+  void recordSccnum(BlockT *SrcBlock, int SCCNum);
+  int getSCCNum(BlockT *srcBlk);
+
+  void retireBlock(BlockT *DstBlock, BlockT *SrcBlock);
+  bool isRetiredBlock(BlockT *SrcBlock);
+  bool isActiveLoophead(BlockT *CurBlock);
+  bool needMigrateBlock(BlockT *Block);
+
+  BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock,
+                              BlockTSmallerVector &exitBlocks,
+                              std::set<BlockT*> &ExitBlockSet);
+  void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL);
+  BlockT *getLoopLandBlock(LoopT *LoopRep);
+  LoopLandInfo *getLoopLandInfo(LoopT *LoopRep);
+
+  void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum);
+
+  bool hasBackEdge(BlockT *curBlock);
+  unsigned getLoopDepth  (LoopT *LoopRep);
+  int countActiveBlock(
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart,
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd);
+  BlockT *findNearestCommonPostDom(std::set<BlockT *>&);
+  BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2);
+
+private:
+  DomTreeT *domTree;
+  PostDomTreeT *postDomTree;
+  LoopInfoT *loopInfo;
+  PassT *passRep;
+  FuncT *funcRep;
+
+  BlockInfoMap blockInfoMap;
+  LoopLandInfoMap loopLandInfoMap;
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks;
+  const AMDILRegisterInfo *TRI;
+
+};  //template class CFGStructurizer
+
+template<class PassT> CFGStructurizer<PassT>::CFGStructurizer()
+  : domTree(NULL), postDomTree(NULL), loopInfo(NULL) {
+}
+
+template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() {
+  for (typename BlockInfoMap::iterator I = blockInfoMap.begin(),
+       E = blockInfoMap.end(); I != E; ++I) {
+    delete I->second;
+  }
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass,
+                                     const AMDILRegisterInfo * tri) {
+  passRep = &pass;
+  funcRep = &func;
+  TRI = tri;
+
+  bool changed = false;
+  //func.RenumberBlocks();
+
+  //to do, if not reducible flow graph, make it so ???
+
+  if (DEBUGME) {
+        errs() << "AMDILCFGStructurizer::prepare\n";
+    //func.viewCFG();
+    //func.viewCFGOnly();
+    //func.dump();
+  }
+
+  //FIXME: gcc complains on this.
+  //domTree = &pass.getAnalysis<DomTreeT>();
+  //domTree = CFGTraits::getDominatorTree(pass);
+  //if (DEBUGME) {
+  //  domTree->print(errs());
+  //}
+
+  //FIXME: gcc complains on this.
+  //postDomTree = &pass.getAnalysis<PostDomTreeT>();
+  //postDomTree = CFGTraits::getPostDominatorTree(pass);
+  //if (DEBUGME) {
+  //  postDomTree->print(errs());
+  //}
+
+  //FIXME: gcc complains on this.
+  //loopInfo = &pass.getAnalysis<LoopInfoT>();
+  loopInfo = CFGTraits::getLoopInfo(pass);
+  if (DEBUGME) {
+    errs() << "LoopInfo:\n";
+    PrintLoopinfo(*loopInfo, errs());
+  }
+
+  orderBlocks();
+  if (DEBUGME) {
+    errs() << "Ordered blocks:\n";
+    printOrderedBlocks(errs());
+  }
+
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks;
+
+  for (typename LoopInfoT::iterator iter = loopInfo->begin(),
+       iterEnd = loopInfo->end();
+       iter != iterEnd; ++iter) {
+    LoopT* loopRep = (*iter);
+    BlockTSmallerVector exitingBlks;
+    loopRep->getExitingBlocks(exitingBlks);
+    
+    if (exitingBlks.size() == 0) {
+      BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep);
+      if (dummyExitBlk != NULL)
+        retBlks.push_back(dummyExitBlk);
+    }
+  }
+
+  // Remove unconditional branch instr.
+  // Add dummy exit block iff there are multiple returns.
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+       iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end();
+       iterBlk != iterEndBlk;
+       ++iterBlk) {
+    BlockT *curBlk = *iterBlk;
+    removeUnconditionalBranch(curBlk);
+    removeRedundantConditionalBranch(curBlk);
+    if (CFGTraits::isReturnBlock(curBlk)) {
+      retBlks.push_back(curBlk);
+    }
+    assert(curBlk->succ_size() <= 2);
+    //assert(curBlk->size() > 0);
+    //removeEmptyBlock(curBlk) ??
+  } //for
+
+  if (retBlks.size() >= 2) {
+    addDummyExitBlock(retBlks);
+    changed = true;
+  }
+
+  return changed;
+} //CFGStructurizer::prepare
+
+template<class PassT>
+bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass,
+    const AMDILRegisterInfo * tri) {
+  passRep = &pass;
+  funcRep = &func;
+  TRI = tri;
+
+  //func.RenumberBlocks();
+
+  //Assume reducible CFG...
+  if (DEBUGME) {
+    errs() << "AMDILCFGStructurizer::run\n";
+    //errs() << func.getFunction()->getNameStr() << "\n";
+    func.viewCFG();
+    //func.viewCFGOnly();
+    //func.dump();
+  }
+
+#if 1
+  //FIXME: gcc complains on this.
+  //domTree = &pass.getAnalysis<DomTreeT>();
+  domTree = CFGTraits::getDominatorTree(pass);
+  if (DEBUGME) {
+    domTree->print(errs(), (const llvm::Module*)0);
+  }
+#endif
+
+  //FIXME: gcc complains on this.
+  //postDomTree = &pass.getAnalysis<PostDomTreeT>();
+  postDomTree = CFGTraits::getPostDominatorTree(pass);
+  if (DEBUGME) {
+    postDomTree->print(errs());
+  }
+
+  //FIXME: gcc complains on this.
+  //loopInfo = &pass.getAnalysis<LoopInfoT>();
+  loopInfo = CFGTraits::getLoopInfo(pass);
+  if (DEBUGME) {
+    errs() << "LoopInfo:\n";
+    PrintLoopinfo(*loopInfo, errs());
+  }
+
+  orderBlocks();
+//#define STRESSTEST
+#ifdef STRESSTEST
+  //Use the worst block ordering to test the algorithm.
+  ReverseVector(orderedBlks);
+#endif
+
+  if (DEBUGME) {
+    errs() << "Ordered blocks:\n";
+    printOrderedBlocks(errs());
+  }
+  int numIter = 0;
+  bool finish = false;
+  BlockT *curBlk;
+  bool makeProgress = false;
+  int numRemainedBlk = countActiveBlock(orderedBlks.begin(),
+                                        orderedBlks.end());
+
+  do {
+    ++numIter;
+    if (DEBUGME) {
+      errs() << "numIter = " << numIter
+             << ", numRemaintedBlk = " << numRemainedBlk << "\n";
+    }
+
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlk = orderedBlks.begin();
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlkEnd = orderedBlks.end();
+
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      sccBeginIter = iterBlk;
+    BlockT *sccBeginBlk = NULL;
+    int sccNumBlk = 0;  // The number of active blocks, init to a
+                        // maximum possible number.
+    int sccNumIter;     // Number of iterations in this SCC.
+
+    while (iterBlk != iterBlkEnd) {
+      curBlk = *iterBlk;
+
+      if (sccBeginBlk == NULL) {
+        sccBeginIter = iterBlk;
+        sccBeginBlk = curBlk;
+        sccNumIter = 0;
+        sccNumBlk = numRemainedBlk; // Init to maximum possible number.
+        if (DEBUGME) {
+              errs() << "start processing SCC" << getSCCNum(sccBeginBlk);
+              errs() << "\n";
+        }
+      }
+
+      if (!isRetiredBlock(curBlk)) {
+        patternMatch(curBlk);
+      }
+
+      ++iterBlk;
+
+      bool contNextScc = true;
+      if (iterBlk == iterBlkEnd
+          || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) {
+        // Just finish one scc.
+        ++sccNumIter;
+        int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk);
+        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) {
+          if (DEBUGME) {
+            errs() << "Can't reduce SCC " << getSCCNum(curBlk)
+                   << ", sccNumIter = " << sccNumIter;
+            errs() << "doesn't make any progress\n";
+          }
+          contNextScc = true;
+        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) {
+          sccNumBlk = sccRemainedNumBlk;
+          iterBlk = sccBeginIter;
+          contNextScc = false;
+          if (DEBUGME) {
+            errs() << "repeat processing SCC" << getSCCNum(curBlk)
+                   << "sccNumIter = " << sccNumIter << "\n";
+            func.viewCFG();
+            //func.viewCFGOnly();
+          }
+        } else {
+          // Finish the current scc.
+          contNextScc = true;
+        }
+      } else {
+        // Continue on next component in the current scc.
+        contNextScc = false;
+      }
+
+      if (contNextScc) {
+        sccBeginBlk = NULL;
+      }
+    } //while, "one iteration" over the function.
+
+    BlockT *entryBlk = FuncGTraits::nodes_begin(&func);
+    if (entryBlk->succ_size() == 0) {
+      finish = true;
+      if (DEBUGME) {
+        errs() << "Reduce to one block\n";
+      }
+    } else {
+      int newnumRemainedBlk
+        = countActiveBlock(orderedBlks.begin(), orderedBlks.end());
+      // consider cloned blocks ??
+      if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) {
+        makeProgress = true;
+        numRemainedBlk = newnumRemainedBlk;
+      } else {
+        makeProgress = false;
+        if (DEBUGME) {
+          errs() << "No progress\n";
+        }
+      }
+    }
+  } while (!finish && makeProgress);
+
+  // Misc wrap up to maintain the consistency of the Function representation.
+  CFGTraits::wrapup(FuncGTraits::nodes_begin(&func));
+
+  // Detach retired Block, release memory.
+  for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(),
+       iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
+    if ((*iterMap).second && (*iterMap).second->isRetired) {
+      assert(((*iterMap).first)->getNumber() != -1);
+      if (DEBUGME) {
+        errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n";
+      }
+      (*iterMap).first->eraseFromParent();  //Remove from the parent Function.
+    }
+    delete (*iterMap).second;
+  }
+  blockInfoMap.clear();
+
+  // clear loopLandInfoMap
+  for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
+       iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
+    delete (*iterMap).second;
+  }
+  loopLandInfoMap.clear();
+
+  if (DEBUGME) {
+    func.viewCFG();
+    //func.dump();
+  }
+
+  if (!finish) {
+    assert(!"IRREDUCIBL_CF");
+  }
+
+  return true;
+} //CFGStructurizer::run
+
+/// Print the ordered Blocks.
+///
+template<class PassT>
+void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
+  size_t i = 0;
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
+       iterBlk != iterBlkEnd;
+       ++iterBlk, ++i) {
+    os << "BB" << (*iterBlk)->getNumber();
+    os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+    if (i != 0 && i % 10 == 0) {
+      os << "\n";
+    } else {
+      os << " ";
+    }
+  }
+} //printOrderedBlocks
+
+/// Compute the reversed DFS post order of Blocks
+///
+template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
+  int sccNum = 0;
+  BlockT *bb;
+  for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
+       sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
+    std::vector<BlockT *> &sccNext = *sccIter;
+    for (typename std::vector<BlockT *>::const_iterator
+         blockIter = sccNext.begin(), blockEnd = sccNext.end();
+         blockIter != blockEnd; ++blockIter) {
+      bb = *blockIter;
+      orderedBlks.push_back(bb);
+      recordSccnum(bb, sccNum);
+    }
+  }
+
+  //walk through all the blocks in func to check for unreachable blocks
+  for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
+       blockEnd1 = FuncGTraits::nodes_end(funcRep);
+       blockIter1 != blockEnd1; ++blockIter1) {
+    BlockT *bb = &(*blockIter1);
+    sccNum = getSCCNum(bb);
+    if (sccNum == INVALIDSCCNUM) {
+      errs() << "unreachable block BB" << bb->getNumber() << "\n";
+    }
+  } //end of for
+} //orderBlocks
+
+template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
+  int numMatch = 0;
+  int curMatch;
+
+  if (DEBUGME) {
+        errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
+  }
+
+  while ((curMatch = patternMatchGroup(curBlk)) > 0) {
+    numMatch += curMatch;
+  }
+
+  if (DEBUGME) {
+        errs() << "End patternMatch BB" << curBlk->getNumber()
+      << ", numMatch = " << numMatch << "\n";
+  }
+
+  return numMatch;
+} //patternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
+  int numMatch = 0;
+  numMatch += serialPatternMatch(curBlk);
+  numMatch += ifPatternMatch(curBlk);
+  //numMatch += switchPatternMatch(curBlk);
+  numMatch += loopendPatternMatch(curBlk);
+  numMatch += loopPatternMatch(curBlk);
+  return numMatch;
+}//patternMatchGroup
+
+template<class PassT>
+int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 1) {
+    return 0;
+  }
+
+  BlockT *childBlk = *curBlk->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
+    return 0;
+  }
+
+  mergeSerialBlock(curBlk, childBlk);
+  ++numSerialPatternMatch;
+  return 1;
+} //serialPatternMatch
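+
+// For illustration, the serial pattern matched above is the trivial chain
+//
+//   curBlk --> childBlk     (childBlk has exactly one predecessor)
+//
+// which mergeSerialBlock() collapses into a single block.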
+
+template<class PassT>
+int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
+  //two edges
+  if (curBlk->succ_size() != 2) {
+    return 0;
+  }
+
+  if (hasBackEdge(curBlk)) {
+    return 0;
+  }
+
+  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
+  if (branchInstr == NULL) {
+    return 0;
+  }
+
+  assert(CFGTraits::isCondBranch(branchInstr));
+
+  BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
+  BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
+  BlockT *landBlk;
+  int cloned = 0;
+
+  // TODO: Simplify
+  if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
+    && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
+    landBlk = *trueBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
+    landBlk = NULL;
+  } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
+    landBlk = falseBlk;
+    falseBlk = NULL;
+  } else if (falseBlk->succ_size() == 1
+             && *falseBlk->succ_begin() == trueBlk) {
+    landBlk = trueBlk;
+    trueBlk = NULL;
+  } else if (falseBlk->succ_size() == 1
+             && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
+    landBlk = *falseBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 1
+    && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
+    landBlk = *trueBlk->succ_begin();
+  } else {
+    return handleJumpintoIf(curBlk, trueBlk, falseBlk);
+  }
+
+  // improveSimpleJumpintoIf can handle the case where landBlk == NULL, but the
+  // new BB created for landBlk == NULL may introduce a new challenge to the
+  // reduction process.
+  if (landBlk != NULL &&
+      ((trueBlk && trueBlk->pred_size() > 1)
+      || (falseBlk && falseBlk->pred_size() > 1))) {
+     cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
+  }
+
+  if (trueBlk && trueBlk->pred_size() > 1) {
+    trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
+    ++cloned;
+  }
+
+  if (falseBlk && falseBlk->pred_size() > 1) {
+    falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
+    ++cloned;
+  }
+
+  mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
+
+  ++numIfPatternMatch;
+
+  numClonedBlock += cloned;
+
+  return 1 + cloned;
+} //ifPatternMatch
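+
+// For illustration, the canonical diamond handled above looks like
+//
+//          curBlk
+//          /    \
+//     trueBlk  falseBlk
+//          \    /
+//         landBlk
+//
+// the other cases degenerate from this shape, with trueBlk or falseBlk set to
+// NULL when one side of the branch falls through to the land block.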
+
+template<class PassT>
+int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
+  return 0;
+} //switchPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  typename std::vector<LoopT *> nestedLoops;
+  while (loopRep) {
+    nestedLoops.push_back(loopRep);
+    loopRep = loopRep->getParentLoop();
+  }
+
+  if (nestedLoops.size() == 0) {
+    return 0;
+  }
+
+  // Process nested loops from the outside in, so a "continue" to an outer
+  // loop is not mistaken for a "break" out of the current loop.
+  int num = 0;
+  for (typename std::vector<LoopT *>::reverse_iterator
+       iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
+       iter != iterEnd; ++iter) {
+    loopRep = *iter;
+
+    if (getLoopLandBlock(loopRep) != NULL) {
+      continue;
+    }
+
+    BlockT *loopHeader = loopRep->getHeader();
+
+    int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
+
+    if (numBreak == -1) {
+      break;
+    }
+
+    int numCont = loopcontPatternMatch(loopRep, loopHeader);
+    num += numBreak + numCont;
+  }
+
+  return num;
+} //loopendPatternMatch
+
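+// Once a loop body is fully reduced (curBlk has no remaining successors),
+// fold every loop headed by curBlk into a WHILELOOP/ENDLOOP region using
+// its recorded landing block.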
+template<class PassT>
+int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 0) {
+    return 0;
+  }
+
+  int numLoop = 0;
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  while (loopRep && loopRep->getHeader() == curBlk) {
+    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
+    if (loopLand) {
+      BlockT *landBlk = loopLand->landBlk;
+      assert(landBlk);
+      if (!isRetiredBlock(landBlk)) {
+        mergeLooplandBlock(curBlk, loopLand);
+        ++numLoop;
+      }
+    }
+    loopRep = loopRep->getParentLoop();
+  }
+
+  numLoopPatternMatch += numLoop;
+
+  return numLoop;
+} //loopPatternMatch
+
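+// Find a single landing block for all exits of loopRep, cloning away side
+// entries on the exit paths where necessary, then lower each exiting edge
+// to a structured break.  Returns the number of transformations performed,
+// or -1 if the loop exits cannot be reduced.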
+template<class PassT>
+int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
+                                                  BlockT *loopHeader) {
+  BlockTSmallerVector exitingBlks;
+  loopRep->getExitingBlocks(exitingBlks);
+
+  if (DEBUGME) {
+    errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
+  }
+
+  if (exitingBlks.size() == 0) {
+    setLoopLandBlock(loopRep);
+    return 0;
+  }
+
+  // Compute the corresponding exitBlks and exit block set.
+  BlockTSmallerVector exitBlks;
+  std::set<BlockT *> exitBlkSet;
+  for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
+       iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
+    BlockT *exitingBlk = *iter;
+    BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
+    exitBlks.push_back(exitBlk);
+    exitBlkSet.insert(exitBlk);  //non-duplicate insert
+  }
+
+  assert(exitBlkSet.size() > 0);
+  assert(exitBlks.size() == exitingBlks.size());
+
+  if (DEBUGME) {
+    errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
+  }
+
+  // Find exitLandBlk.
+  BlockT *exitLandBlk = NULL;
+  int numCloned = 0;
+  int numSerial = 0;
+
+  if (exitBlkSet.size() == 1) {
+    exitLandBlk = *exitBlkSet.begin();
+  } else {
+    exitLandBlk = findNearestCommonPostDom(exitBlkSet);
+
+    if (exitLandBlk == NULL) {
+      return -1;
+    }
+
+    bool allInPath = true;
+    bool allNotInPath = true;
+    for (typename std::set<BlockT*>::const_iterator
+         iter = exitBlkSet.begin(),
+         iterEnd = exitBlkSet.end();
+         iter != iterEnd; ++iter) {
+      BlockT *exitBlk = *iter;
+
+      PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
+      if (DEBUGME) {
+        errs() << "BB" << exitBlk->getNumber()
+               << " to BB" << exitLandBlk->getNumber() << " PathToKind="
+               << pathKind << "\n";
+      }
+
+      allInPath = allInPath && (pathKind == SinglePath_InPath);
+      allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
+
+      if (!allInPath && !allNotInPath) {
+        if (DEBUGME) {
+          errs() << "singlePath check fail\n";
+        }
+        return -1;
+      }
+    } // check all exit blocks
+
+    if (allNotInPath) {
+#if 1
+
+      // TODO: Simplify, maybe separate function?
+      //funcRep->viewCFG();
+      LoopT *parentLoopRep = loopRep->getParentLoop();
+      BlockT *parentLoopHeader = NULL;
+      if (parentLoopRep)
+        parentLoopHeader = parentLoopRep->getHeader();
+
+      if (exitLandBlk == parentLoopHeader &&
+          (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
+                                               loopRep,
+                                               exitBlkSet,
+                                               exitLandBlk)) != NULL) {
+        if (DEBUGME) {
+          errs() << "relocateLoopcontBlock success\n";
+        }
+      } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
+                                                      exitingBlks,
+                                                      exitBlks)) != NULL) {
+        if (DEBUGME) {
+          errs() << "insertEndbranchBlock success\n";
+        }
+      } else {
+        if (DEBUGME) {
+          errs() << "loop exit fail\n";
+        }
+        return -1;
+      }
+#else
+      return -1;
+#endif
+    }
+
+    // Handle side entry to exit path.
+    exitBlks.clear();
+    exitBlkSet.clear();
+    for (typename BlockTSmallerVector::iterator iterExiting =
+           exitingBlks.begin(),
+         iterExitingEnd = exitingBlks.end();
+         iterExiting != iterExitingEnd; ++iterExiting) {
+      BlockT *exitingBlk = *iterExiting;
+      BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
+      BlockT *newExitBlk = exitBlk;
+
+      if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
+        newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
+        ++numCloned;
+      }
+
+      numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
+
+      exitBlks.push_back(newExitBlk);
+      exitBlkSet.insert(newExitBlk);
+    }
+
+    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
+         iterExitEnd = exitBlks.end();
+         iterExit != iterExitEnd; ++iterExit) {
+      BlockT *exitBlk = *iterExit;
+      numSerial += serialPatternMatch(exitBlk);
+    }
+
+    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
+         iterExitEnd = exitBlks.end();
+         iterExit != iterExitEnd; ++iterExit) {
+      BlockT *exitBlk = *iterExit;
+      if (exitBlk->pred_size() > 1) {
+        if (exitBlk != exitLandBlk) {
+          return -1;
+        }
+      } else {
+        if (exitBlk != exitLandBlk &&
+            (exitBlk->succ_size() != 1 ||
+            *exitBlk->succ_begin() != exitLandBlk)) {
+          return -1;
+        }
+      }
+    }
+  } // else
+
+  // LoopT *exitLandLoop = loopInfo->getLoopFor(exitLandBlk);
+  exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet);
+
+  // Fold the break into the breaking block.  This also covers breaks that
+  // cross loop levels.
+  assert(exitingBlks.size() == exitBlks.size());
+  for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
+       iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
+       iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
+    BlockT *exitBlk = *iterExit;
+    BlockT *exitingBlk = *iterExiting;
+    assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
+    LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
+    handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
+  }
+
+  int numBreak = static_cast<int>(exitingBlks.size());
+  numLoopbreakPatternMatch += numBreak;
+  numClonedBlock += numCloned;
+  return numBreak + numSerial + numCloned;
+} //loopbreakPatternMatch
+
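+// Lower each back edge into loopHeader (i.e. each continue) to structured
+// form, then remove the back edges themselves.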
+template<class PassT>
+int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
+                                                 BlockT *loopHeader) {
+  int numCont = 0;
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
+  for (typename InvBlockGTraits::ChildIteratorType iter =
+       InvBlockGTraits::child_begin(loopHeader),
+       iterEnd = InvBlockGTraits::child_end(loopHeader);
+       iter != iterEnd; ++iter) {
+    BlockT *curBlk = *iter;
+    if (loopRep->contains(curBlk)) {
+      handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
+                          loopHeader, loopRep);
+      contBlk.push_back(curBlk);
+      ++numCont;
+    }
+  }
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator
+       iter = contBlk.begin(), iterEnd = contBlk.end();
+       iter != iterEnd; ++iter) {
+    (*iter)->removeSuccessor(loopHeader);
+  }
+
+  numLoopcontPatternMatch += numCont;
+
+  return numCont;
+} //loopcontPatternMatch
+
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
+                                                         BlockT *src2Blk) {
+  // Return true iff src1Blk has no successors, and src1Blk and src2Blk are in
+  // the same loop and that loop has a LoopLandInfo entry.  Without explicitly
+  // tracking loopContBlks and loopBreakBlks, this is a way to recover that
+  // information.
+  //
+  if (src1Blk->succ_size() == 0) {
+    LoopT *loopRep = loopInfo->getLoopFor(src1Blk);
+    if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) {
+      LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+      if (theEntry != NULL) {
+        if (DEBUGME) {
+          errs() << "isLoopContBreakBlock yes src1 = BB"
+                 << src1Blk->getNumber()
+                 << " src2 = BB" << src2Blk->getNumber() << "\n";
+        }
+        return true;
+      }
+    }
+  }
+  return false;
+}  //isSameloopDetachedContbreak
+
+template<class PassT>
+int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk,
+                                             BlockT *trueBlk,
+                                             BlockT *falseBlk) {
+  int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk);
+  if (num == 0) {
+    if (DEBUGME) {
+      errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
+    }
+    num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk);
+  }
+  return num;
+}
+
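+// Walk down the single-successor chain starting at trueBlk; the first block
+// that falseBlk also reaches on a single path serves as the landing block.
+// Side entries into both arms are cloned away, and the serial and if
+// matchers are re-run on the now-reducible region.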
+template<class PassT>
+int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk,
+                                                BlockT *trueBlk,
+                                                BlockT *falseBlk) {
+  int num = 0;
+  BlockT *downBlk;
+
+  //trueBlk could be the common post dominator
+  downBlk = trueBlk;
+
+  if (DEBUGME) {
+    errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber()
+           << " true = BB" << trueBlk->getNumber()
+           << ", numSucc=" << trueBlk->succ_size()
+           << " false = BB" << falseBlk->getNumber() << "\n";
+  }
+
+  while (downBlk) {
+    if (DEBUGME) {
+      errs() << "check down = BB" << downBlk->getNumber();
+    }
+
+    if (//postDomTree->dominates(downBlk, falseBlk) &&
+        singlePathTo(falseBlk, downBlk) == SinglePath_InPath) {
+      if (DEBUGME) {
+        errs() << " working\n";
+      }
+
+      num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk);
+      num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk);
+
+      numClonedBlock += num;
+      num += serialPatternMatch(*headBlk->succ_begin());
+      num += serialPatternMatch(*(++headBlk->succ_begin()));
+      num += ifPatternMatch(headBlk);
+      assert(num > 0);
+
+      break;
+    }
+    if (DEBUGME) {
+      errs() << " not working\n";
+    }
+    downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
+  } // walk down the postDomTree
+
+  return num;
+} //handleJumpintoIf
+
+template<class PassT>
+void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
+                                                         BlockT *trueBlk,
+                                                         BlockT *falseBlk,
+                                                         BlockT *landBlk,
+                                                         bool detail) {
+  errs() << "head = BB" << headBlk->getNumber()
+         << " size = " << headBlk->size();
+  if (detail) {
+    errs() << "\n";
+    headBlk->print(errs());
+    errs() << "\n";
+  }
+
+  if (trueBlk) {
+    errs() << ", true = BB" << trueBlk->getNumber() << " size = "
+           << trueBlk->size() << " numPred = " << trueBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      trueBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (falseBlk) {
+    errs() << ", false = BB" << falseBlk->getNumber() << " size = "
+           << falseBlk->size() << " numPred = " << falseBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      falseBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (landBlk) {
+    errs() << ", land = BB" << landBlk->getNumber() << " size = "
+           << landBlk->size() << " numPred = " << landBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      landBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+
+  errs() << "\n";
+} //showImproveSimpleJumpintoIf
+
+template<class PassT>
+int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
+                                                    BlockT *trueBlk,
+                                                    BlockT *falseBlk,
+                                                    BlockT **plandBlk) {
+  bool migrateTrue = false;
+  bool migrateFalse = false;
+
+  BlockT *landBlk = *plandBlk;
+
+  assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
+         && (falseBlk == NULL || falseBlk->succ_size() <= 1));
+
+  if (trueBlk == falseBlk) {
+    return 0;
+  }
+
+#if 0
+  if (DEBUGME) {
+    errs() << "improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+  }
+#endif
+
+  // unsigned landPredSize = landBlk ? landBlk->pred_size() : 0;
+  // landBlk->pred_size() may be worth considering here, since it represents
+  // the number of "initReg = ..." assignments that would need to be inserted.
+  migrateTrue = needMigrateBlock(trueBlk);
+  migrateFalse = needMigrateBlock(falseBlk);
+
+  if (!migrateTrue && !migrateFalse) {
+    return 0;
+  }
+
+  // If we need to migrate either trueBlk or falseBlk, also migrate whichever
+  // of them has more than one predecessor.  Without doing this, a path coming
+  // from a predecessor other than headBlk would see an undefined value in
+  // initReg.
+  if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
+    migrateTrue = true;
+  }
+  if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
+    migrateFalse = true;
+  }
+
+  if (DEBUGME) {
+    errs() << "before improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+    //showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 1);
+  }
+
+  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
+  //
+  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
+  //      {initReg = 0; org falseBlk branch }
+  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
+  //      => org landBlk
+  //      if landBlk->pred_size() > 2, put the above if-else inside
+  //      if (initReg != 2) {...}
+  //
+  // add initReg = initVal to headBlk
+
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+  unsigned initReg =
+    funcRep->getRegInfo().createVirtualRegister(I32RC);
+  if (!migrateTrue || !migrateFalse) {
+    int initVal = migrateTrue ? 0 : 1;
+    CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
+  }
+
+  int numNewBlk = 0;
+
+  if (landBlk == NULL) {
+    landBlk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(landBlk);  //insert to function
+
+    if (trueBlk) {
+      trueBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    if (falseBlk) {
+      falseBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    ++numNewBlk;
+  }
+
+  bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
+
+  //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
+  typename BlockT::iterator insertPos =
+    CFGTraits::getInstrPos
+    (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep));
+
+  if (landBlkHasOtherPred) {
+    unsigned immReg =
+      funcRep->getRegInfo().createVirtualRegister(I32RC);
+    CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
+    unsigned cmpResReg =
+      funcRep->getRegInfo().createVirtualRegister(I32RC);
+
+    CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
+                                        initReg, immReg);
+    CFGTraits::insertCondBranchBefore(landBlk, insertPos,
+                                      AMDGPU::IF_LOGICALZ_i32, passRep,
+                                      cmpResReg, DebugLoc());
+  }
+
+  CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_LOGICALNZ_i32,
+                                    passRep, initReg, DebugLoc());
+
+  if (migrateTrue) {
+    migrateInstruction(trueBlk, landBlk, insertPos);
+    // We need to unconditionally insert the assignment to ensure that a path
+    // from a predecessor other than headBlk has a valid value in initReg if
+    // (initVal != 1).
+    CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
+  }
+  CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep);
+
+  if (migrateFalse) {
+    migrateInstruction(falseBlk, landBlk, insertPos);
+    // We need to unconditionally insert the assignment to ensure that a path
+    // from a predecessor other than headBlk has a valid value in initReg if
+    // (initVal != 0).
+    CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
+  }
+  //CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep);
+
+  if (landBlkHasOtherPred) {
+    // add endif
+    CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep);
+
+    // put initReg = 2 to other predecessors of landBlk
+    for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
+         predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
+         ++predIter) {
+      BlockT *curBlk = *predIter;
+      if (curBlk != trueBlk && curBlk != falseBlk) {
+        CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
+      }
+    } //for
+  }
+  if (DEBUGME) {
+    errs() << "result from improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+    //showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 1);
+  }
+
+  // update landBlk
+  *plandBlk = landBlk;
+
+  return numNewBlk;
+} //improveSimpleJumpintoIf
+
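+// Lower the break taken from exitingBlk out of exitLoop.  When the break
+// crosses loop levels, a flag register is allocated: it is initialized
+// before exitLoop and tested by each intermediate loop (break-on-reg), so
+// control unwinds one loop level at a time.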
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
+                                              LoopT *exitingLoop,
+                                             BlockT *exitBlk,
+                                              LoopT *exitLoop,
+                                             BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
+           << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
+  }
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+
+  RegiT initReg = INVALIDREGNUM;
+  if (exitingLoop != exitLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(I32RC));
+    assert(initReg != INVALIDREGNUM);
+    addLoopBreakInitReg(exitLoop, initReg);
+    while (exitingLoop != exitLoop && exitingLoop) {
+      addLoopBreakOnReg(exitingLoop, initReg);
+      exitingLoop = exitingLoop->getParentLoop();
+    }
+    assert(exitingLoop == exitLoop);
+  }
+
+  mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
+
+} //handleLoopbreak
+
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
+                                                  LoopT *contingLoop,
+                                                 BlockT *contBlk,
+                                                  LoopT *contLoop) {
+  if (DEBUGME) {
+    errs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
+           << " header = BB" << contBlk->getNumber() << "\n";
+
+    errs() << "Trying to continue loop-depth = "
+           << getLoopDepth(contLoop)
+           << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
+  }
+
+  RegiT initReg = INVALIDREGNUM;
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+  if (contingLoop != contLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(I32RC));
+    assert(initReg != INVALIDREGNUM);
+    addLoopContInitReg(contLoop, initReg);
+    while (contingLoop && contingLoop->getParentLoop() != contLoop) {
+      addLoopBreakOnReg(contingLoop, initReg);  //not addLoopContOnReg
+      contingLoop = contingLoop->getParentLoop();
+    }
+    assert(contingLoop && contingLoop->getParentLoop() == contLoop);
+    addLoopContOnReg(contingLoop, initReg);
+  }
+
+  settleLoopcontBlock(contingBlk, contBlk, initReg);
+  //contingBlk->removeSuccessor(loopHeader);
+} //handleLoopcontBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
+  if (DEBUGME) {
+    errs() << "serialPattern BB" << dstBlk->getNumber()
+           << " <= BB" << srcBlk->getNumber() << "\n";
+  }
+  //removeUnconditionalBranch(dstBlk);
+  dstBlk->splice(dstBlk->end(), srcBlk, FirstNonDebugInstr(srcBlk), srcBlk->end());
+
+  dstBlk->removeSuccessor(srcBlk);
+  CFGTraits::cloneSuccessorList(dstBlk, srcBlk);
+
+  removeSuccessor(srcBlk);
+  retireBlock(dstBlk, srcBlk);
+} //mergeSerialBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
+                                                  BlockT *curBlk,
+                                                  BlockT *trueBlk,
+                                                  BlockT *falseBlk,
+                                                  BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "ifPattern BB" << curBlk->getNumber();
+    errs() << "{  ";
+    if (trueBlk) {
+      errs() << "BB" << trueBlk->getNumber();
+    }
+    errs() << "  } else ";
+    errs() << "{  ";
+    if (falseBlk) {
+      errs() << "BB" << falseBlk->getNumber();
+    }
+    errs() << "  }\n ";
+    errs() << "landBlock: ";
+    if (landBlk == NULL) {
+      errs() << "NULL";
+    } else {
+      errs() << "BB" << landBlk->getNumber();
+    }
+    errs() << "\n";
+  }
+
+  int oldOpcode = branchInstr->getOpcode();
+  DebugLoc branchDL = branchInstr->getDebugLoc();
+
+  //    transform to
+  //    if cond
+  //       trueBlk
+  //    else
+  //       falseBlk
+  //    endif
+  //    landBlk
+
+  typename BlockT::iterator branchInstrPos =
+    CFGTraits::getInstrPos(curBlk, branchInstr);
+  CFGTraits::insertCondBranchBefore(branchInstrPos,
+                                    CFGTraits::getBranchNzeroOpcode(oldOpcode),
+                                    passRep,
+                                    branchDL);
+
+  if (trueBlk) {
+    curBlk->splice(branchInstrPos, trueBlk, FirstNonDebugInstr(trueBlk), trueBlk->end());
+    curBlk->removeSuccessor(trueBlk);
+    if (landBlk && trueBlk->succ_size() != 0) {
+      trueBlk->removeSuccessor(landBlk);
+    }
+    retireBlock(curBlk, trueBlk);
+  }
+  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep);
+
+  if (falseBlk) {
+    curBlk->splice(branchInstrPos, falseBlk, FirstNonDebugInstr(falseBlk),
+                   falseBlk->end());
+    curBlk->removeSuccessor(falseBlk);
+    if (landBlk && falseBlk->succ_size() != 0) {
+      falseBlk->removeSuccessor(landBlk);
+    }
+    retireBlock(curBlk, falseBlk);
+  }
+  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep);
+
+  //curBlk->remove(branchInstrPos);
+  branchInstr->eraseFromParent();
+
+  if (landBlk && trueBlk && falseBlk) {
+    curBlk->addSuccessor(landBlk);
+  }
+
+} //mergeIfthenelseBlock
+
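+// Fold a fully reduced loop into its header dstBlk: initialize the loop's
+// cont/break/endbranch registers, wrap the body in WHILELOOP/ENDLOOP,
+// append the conditional break/continue checks for enclosing loops after
+// the ENDLOOP, and finally absorb the landing block.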
+template<class PassT>
+void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
+                                                LoopLandInfo *loopLand) {
+  BlockT *landBlk = loopLand->landBlk;
+
+  if (DEBUGME) {
+    errs() << "loopPattern header = BB" << dstBlk->getNumber()
+           << " land = BB" << landBlk->getNumber() << "\n";
+  }
+
+  // Loop contInitRegs are init at the beginning of the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->contInitRegs.begin(),
+       iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* We last inserted the DebugLoc in the BREAK_LOGICALZ_i32 or
+   * AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk.
+   * Search for the DebugLoc in that statement.
+   * If not found, we have to insert the empty/default DebugLoc. */
+  InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
+  DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak);
+  // Loop breakInitRegs are init before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakInitRegs.begin(),
+       iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+  // Loop endbranchInitRegs are init before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->endbranchInitRegs.begin(),
+       iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* We last inserted the DebugLoc in the continue statement in the current
+   * dstBlk.  Search for the DebugLoc in the continue statement.
+   * If not found, we have to insert the empty/default DebugLoc. */
+  InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
+  DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue);
+  // Loop breakOnRegs are checked after the ENDLOOP: break out of the loop
+  // enclosing this loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakOnRegs.begin(),
+       iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::BREAK_LOGICALNZ_i32, passRep,
+                                   *iter);
+  }
+
+  // Loop contOnRegs are checked after the ENDLOOP: continue the loop
+  // enclosing this loop.
+  for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
+       iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32,
+                                   passRep, *iter);
+  }
+
+  dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
+
+  for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
+       iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
+    dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of.
+  }
+
+  removeSuccessor(landBlk);
+  retireBlock(dstBlk, landBlk);
+} //mergeLooplandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
+                                                 BlockT *exitBlk,
+                                                 BlockT *exitLandBlk,
+                                                 RegiT  setReg) {
+  if (DEBUGME) {
+    errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
+           << " exit = BB" << exitBlk->getNumber()
+           << " land = BB" << exitLandBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
+  assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
+
+  DebugLoc DL = branchInstr->getDebugLoc();
+
+  BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+  int oldOpcode = branchInstr->getOpcode();
+
+  //    transform exitingBlk to
+  //    if ( ) {
+  //       exitBlk (if exitBlk != exitLandBlk)
+  //       setReg = 1
+  //       break
+  //    }endif
+  //    successor = {orgSuccessor(exitingBlk) - exitBlk}
+
+  typename BlockT::iterator branchInstrPos =
+    CFGTraits::getInstrPos(exitingBlk, branchInstr);
+
+  if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
+    //break_logical
+    int newOpcode =
+    (trueBranch == exitBlk) ? CFGTraits::getBreakNzeroOpcode(oldOpcode)
+                            : CFGTraits::getBreakZeroOpcode(oldOpcode);
+    CFGTraits::insertCondBranchBefore(branchInstrPos, newOpcode, passRep, DL);
+  } else {
+    int newOpcode =
+    (trueBranch == exitBlk) ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
+                            : CFGTraits::getBranchZeroOpcode(oldOpcode);
+    CFGTraits::insertCondBranchBefore(branchInstrPos, newOpcode, passRep, DL);
+    if (exitBlk != exitLandBlk) {
+      //splice is insert-before ...
+      exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
+                         exitBlk->end());
+    }
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+    }
+    CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep);
+    CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep);
+  } //if_logical
+
+  //now branchInstr can be erased safely
+  //exitingBlk->eraseFromParent(branchInstr);
+  branchInstr->eraseFromParent();
+
+  //now take care of successors, retire blocks
+  exitingBlk->removeSuccessor(exitBlk);
+  if (exitBlk != exitLandBlk) {
+    //splice is insert-before ...
+    exitBlk->removeSuccessor(exitLandBlk);
+    retireBlock(exitingBlk, exitBlk);
+  }
+
+} //mergeLoopbreakBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
+                                                 BlockT *contBlk,
+                                                 RegiT   setReg) {
+  if (DEBUGME) {
+    errs() << "settleLoopcontBlock conting = BB"
+           << contingBlk->getNumber()
+           << ", cont = BB" << contBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
+  if (branchInstr) {
+    assert(CFGTraits::isCondBranch(branchInstr));
+    typename BlockT::iterator branchInstrPos =
+      CFGTraits::getInstrPos(contingBlk, branchInstr);
+    BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+    int oldOpcode = branchInstr->getOpcode();
+    DebugLoc DL = branchInstr->getDebugLoc();
+
+    //    transform contingBlk to
+    //     if () {
+    //          move instr after branchInstr
+    //          continue
+    //        or
+    //          setReg = 1
+    //          break
+    //     }endif
+    //     successor = {orgSuccessor(contingBlk) - loopHeader}
+
+    bool useContinueLogical =
+      (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
+
+    if (!useContinueLogical) {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
+                              : CFGTraits::getBranchZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+
+      if (setReg != INVALIDREGNUM) {
+        CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+        // insertEnd to ensure phi-moves, if they exist, go before the continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL);
+      } else {
+        // insertEnd to ensure phi-moves, if they exist, go before the continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL);
+      }
+
+      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL);
+    } else {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
+                              : CFGTraits::getContinueZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+    }
+
+    //contingBlk->eraseFromParent(branchInstr);
+    branchInstr->eraseFromParent();
+  } else {
+    /* If we've arrived here then we've already erased the branch instruction.
+     * Travel back up the basic block to find the last reference of our debug
+     * location; we've just inserted that reference here, so it should be
+     * representative. */
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
+      // insertEnd to ensure phi-moves, if they exist, go before the continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep,
+                                CFGTraits::getLastDebugLocInBB(contingBlk));
+    } else {
+      // insertEnd to ensure phi-moves, if they exist, go before the continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep,
+                                CFGTraits::getLastDebugLocInBB(contingBlk));
+    }
+  } //else
+
+} //settleLoopcontBlock
+
+// The BBs in exitBlkSet have been determined to be on the break path for
+// loopRep.  Before we can place the code for those BBs inside the loop body
+// of loopRep, check whether any of them were determined earlier to be
+// continue BBs for parentLoopRep.
+// If so, generate a new BB newBlk:
+//    (1) make newBlk the common successor of the BBs in exitBlkSet
+//    (2) change the continue-instr in the BBs in exitBlkSet to a break-instr
+//    (3) generate a continue-instr in newBlk
+//
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
+                                              LoopT *loopRep,
+                                              std::set<BlockT *> &exitBlkSet,
+                                              BlockT *exitLandBlk) {
+  std::set<BlockT *> endBlkSet;
+
+//  BlockT *parentLoopHead = parentLoopRep->getHeader();
+
+
+  for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
+       iterEnd = exitBlkSet.end();
+       iter != iterEnd; ++iter) {
+    BlockT *exitBlk = *iter;
+    BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
+
+    if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
+      return NULL;
+
+    endBlkSet.insert(endBlk);
+  }
+
+  BlockT *newBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(newBlk);  //insert to function
+  CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep);
+  SHOWNEWBLK(newBlk, "New continue block: ");
+
+  for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
+       iterEnd = endBlkSet.end();
+       iter != iterEnd; ++iter) {
+      BlockT *endBlk = *iter;
+      InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
+      if (contInstr) {
+        contInstr->eraseFromParent();
+      }
+      endBlk->addSuccessor(newBlk);
+      if (DEBUGME) {
+        errs() << "Add new continue Block to BB"
+               << endBlk->getNumber() << " successors\n";
+      }
+  }
+
+  return newBlk;
+} //relocateLoopcontBlock
+
+
+// A LoopEndbranchBlock is a BB created by the CFGStructurizer to use as the
+// LoopLandBlock.  This BB branches on the loop endBranchInit register to the
+// paths corresponding to the loop's exiting branches.
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
+                                              BlockTSmallerVector &exitingBlks,
+                                              BlockTSmallerVector &exitBlks) {
+  const AMDILInstrInfo *tii =
+             static_cast<const AMDILInstrInfo *>(passRep->getTargetInstrInfo());
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+
+  RegiT endBranchReg = static_cast<int>
+    (funcRep->getRegInfo().createVirtualRegister(I32RC));
+  assert(endBranchReg >= 0);
+
+  // reg = 0 before entering the loop
+  addLoopEndbranchInitReg(loopRep, endBranchReg);
+
+  uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
+  assert(numBlks >= 2 && numBlks == exitBlks.size());
+
+  BlockT *preExitingBlk = exitingBlks[0];
+  BlockT *preExitBlk = exitBlks[0];
+  BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(preBranchBlk);  //insert to function
+  SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
+
+  BlockT *newLandBlk = preBranchBlk;
+
+  CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
+                                        newLandBlk);
+  preExitingBlk->removeSuccessor(preExitBlk);
+  preExitingBlk->addSuccessor(newLandBlk);
+
+  //it is redundant to add reg = 0 to exitingBlks[0]
+
+  // For the 1..n-th exiting paths (the last iteration handles two paths),
+  // create the branch to the previous path and the current path.
+  for (uint32_t i = 1; i < numBlks; ++i) {
+    BlockT *curExitingBlk = exitingBlks[i];
+    BlockT *curExitBlk = exitBlks[i];
+    BlockT *curBranchBlk;
+
+    if (i == numBlks - 1) {
+      curBranchBlk = curExitBlk;
+    } else {
+      curBranchBlk = funcRep->CreateMachineBasicBlock();
+      funcRep->push_back(curBranchBlk);  //insert to function
+      SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
+    }
+
+    // Add reg = i to exitingBlks[i].
+    CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
+                                       endBranchReg, i);
+
+    // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge
+    // (exitingBlks[i], newLandBlk).
+    CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
+                                          newLandBlk);
+    curExitingBlk->removeSuccessor(curExitBlk);
+    curExitingBlk->addSuccessor(newLandBlk);
+
+    // add to preBranchBlk the branch instruction:
+    // if (endBranchReg == preVal)
+    //    preExitBlk
+    // else
+    //    curBranchBlk
+    //
+    // preValReg = i - 1
+
+    DebugLoc DL;
+    RegiT preValReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(I32RC));
+
+    preBranchBlk->insert(preBranchBlk->begin(),
+                         tii->getMovImmInstr(preBranchBlk->getParent(),
+                                             preValReg, i - 1));
+
+    // condResReg = (endBranchReg == preValReg)
+    RegiT condResReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(I32RC));
+    BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg)
+      .addReg(endBranchReg).addReg(preValReg);
+
+    BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32))
+      .addMBB(preExitBlk).addReg(condResReg);
+
+    preBranchBlk->addSuccessor(preExitBlk);
+    preBranchBlk->addSuccessor(curBranchBlk);
+
+    // Update preExitingBlk, preExitBlk, preBranchBlk.
+    preExitingBlk = curExitingBlk;
+    preExitBlk = curExitBlk;
+    preBranchBlk = curBranchBlk;
+
+  }  //end for 1 .. n blocks
+
+  return newLandBlk;
+} //addLoopEndbranchBlock
+
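+// Follow single successors from srcBlk: returns SinglePath_InPath if dstBlk
+// is reached, SinglePath_NotInPath if the chain ends without reaching
+// dstBlk, and Not_SinglePath otherwise (multi-way branch, or a side entry
+// when allowSideEntry is false).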
+template<class PassT>
+typename CFGStructurizer<PassT>::PathToKind
+CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
+                                     bool allowSideEntry) {
+  assert(dstBlk);
+
+  if (srcBlk == dstBlk) {
+    return SinglePath_InPath;
+  }
+
+  while (srcBlk && srcBlk->succ_size() == 1) {
+    srcBlk = *srcBlk->succ_begin();
+    if (srcBlk == dstBlk) {
+      return SinglePath_InPath;
+    }
+
+    if (!allowSideEntry && srcBlk->pred_size() > 1) {
+      return Not_SinglePath;
+    }
+  }
+
+  if (srcBlk && srcBlk->succ_size() == 0) {
+    return SinglePath_NotInPath;
+  }
+
+  return Not_SinglePath;
+} //singlePathTo
+
+// If there is a single path from srcBlk to dstBlk, return the last block
+// before dstBlk.  If there is a single path from srcBlk to the end without
+// passing through dstBlk, return the last block on that path.  Otherwise,
+// return NULL.
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
+                                      bool allowSideEntry) {
+  assert(dstBlk);
+
+  if (srcBlk == dstBlk) {
+    return srcBlk;
+  }
+
+  if (srcBlk->succ_size() == 0) {
+    return srcBlk;
+  }
+
+  while (srcBlk && srcBlk->succ_size() == 1) {
+    BlockT *preBlk = srcBlk;
+
+    srcBlk = *srcBlk->succ_begin();
+    if (srcBlk == NULL) {
+      return preBlk;
+    }
+
+    if (!allowSideEntry && srcBlk->pred_size() > 1) {
+      return NULL;
+    }
+  }
+
+  if (srcBlk && srcBlk->succ_size() == 0) {
+    return srcBlk;
+  }
+
+  return NULL;
+
+} //singlePathEnd
+
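+// Walk the single-successor chain from srcBlk (a successor of preBlk)
+// toward dstBlk, cloning every block on the chain that has extra
+// predecessors so the path from preBlk becomes free of side entries.
+// Returns the number of blocks cloned.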
+template<class PassT>
+int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
+                                               BlockT *dstBlk) {
+  int cloned = 0;
+  assert(preBlk->isSuccessor(srcBlk));
+  while (srcBlk && srcBlk != dstBlk) {
+    assert(srcBlk->succ_size() == 1);
+    if (srcBlk->pred_size() > 1) {
+      srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);
+      ++cloned;
+    }
+
+    preBlk = srcBlk;
+    srcBlk = *srcBlk->succ_begin();
+  }
+
+  return cloned;
+} //cloneOnSideEntryTo
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
+                                                 BlockT *predBlk) {
+  assert(predBlk->isSuccessor(curBlk) &&
+         "succBlk is not a prececessor of curBlk");
+
+  BlockT *cloneBlk = CFGTraits::clone(curBlk);  //clone instructions
+  CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk);
+  //srcBlk, oldBlk, newBlk
+
+  predBlk->removeSuccessor(curBlk);
+  predBlk->addSuccessor(cloneBlk);
+
+  // add all successor to cloneBlk
+  CFGTraits::cloneSuccessorList(cloneBlk, curBlk);
+
+  numClonedInstr += curBlk->size();
+
+  if (DEBUGME) {
+    errs() << "Cloned block: " << "BB"
+           << curBlk->getNumber() << "size " << curBlk->size() << "\n";
+  }
+
+  SHOWNEWBLK(cloneBlk, "result of Cloned block: ");
+
+  return cloneBlk;
+} //cloneBlockForPredecessor
+
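+// Return the unique successor of exitingBlk that lies outside loopRep
+// (asserts there is exactly one).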
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep,
+                                               BlockT *exitingBlk) {
+  BlockT *exitBlk = NULL;
+
+  for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(),
+       iterSuccEnd = exitingBlk->succ_end();
+       iterSucc != iterSuccEnd; ++iterSucc) {
+    BlockT *curBlk = *iterSucc;
+    if (!loopRep->contains(curBlk)) {
+      assert(exitBlk == NULL);
+      exitBlk = curBlk;
+    }
+  }
+
+  assert(exitBlk != NULL);
+
+  return exitBlk;
+} //exitingBlock2ExitBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk,
+                                                BlockT *dstBlk,
+                                                InstrIterator insertPos) {
+  InstrIterator spliceEnd;
+  //look for the input branchinstr, not the AMDIL branchinstr
+  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
+  if (branchInstr == NULL) {
+    if (DEBUGME) {
+      errs() << "migrateInstruction don't see branch instr\n" ;
+    }
+    spliceEnd = srcBlk->end();
+  } else {
+    if (DEBUGME) {
+      errs() << "migrateInstruction see branch instr\n" ;
+      branchInstr->dump();
+    }
+    spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr);
+  }
+  if (DEBUGME) {
+    errs() << "migrateInstruction before splice dstSize = " << dstBlk->size()
+      << "srcSize = " << srcBlk->size() << "\n";
+  }
+
+  //splice insert before insertPos
+  dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd);
+
+  if (DEBUGME) {
+    errs() << "migrateInstruction after splice dstSize = " << dstBlk->size()
+      << "srcSize = " << srcBlk->size() << "\n";
+  }
+} //migrateInstruction
+
+// normalizeInfiniteLoopExit changes
+//   B1:
+//        uncond_br LoopHeader
+//
+// to
+//   B1:
+//        cond_br 1 LoopHeader dummyExit
+// and returns the newly added dummy exit block.
+//
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) {
+  BlockT *loopHeader;
+  BlockT *loopLatch;
+  loopHeader = LoopRep->getHeader();
+  loopLatch = LoopRep->getLoopLatch();
+  BlockT *dummyExitBlk = NULL;
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+  if (loopHeader != NULL && loopLatch != NULL) {
+    InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch);
+    if (branchInstr != NULL && CFGTraits::isUncondBranch(branchInstr)) {
+      dummyExitBlk = funcRep->CreateMachineBasicBlock();
+      funcRep->push_back(dummyExitBlk);  //insert to function
+      SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+
+      if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n";
+
+      typename BlockT::iterator insertPos =
+        CFGTraits::getInstrPos(loopLatch, branchInstr);
+      unsigned immReg =
+        funcRep->getRegInfo().createVirtualRegister(I32RC);
+      CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1);
+      InstrT *newInstr =
+        CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep);
+      MachineInstrBuilder(newInstr).addMBB(loopHeader).addReg(immReg, false);
+
+      SHOWNEWINSTR(newInstr);
+
+      branchInstr->eraseFromParent();
+      loopLatch->addSuccessor(dummyExitBlk);
+    }
+  }
+
+  return dummyExitBlk;
+} //normalizeInfiniteLoopExit
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
+  InstrT *branchInstr;
+
+  // I saw two unconditional branches in one basic block in the example
+  // test_fc_do_while_or.c; upstream needs a fix for this to remove the loop.
+  while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
+          && CFGTraits::isUncondBranch(branchInstr)) {
+    if (DEBUGME) {
+      errs() << "Removing unconditional branch instruction";
+      branchInstr->dump();
+    }
+    branchInstr->eraseFromParent();
+  }
+} //removeUnconditionalBranch
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
+  if (srcBlk->succ_size() == 2) {
+    BlockT *blk1 = *srcBlk->succ_begin();
+    BlockT *blk2 = *(++srcBlk->succ_begin());
+
+    if (blk1 == blk2) {
+      InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
+      assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
+      if (DEBUGME) {
+        errs() << "Removing unneeded conditional branch instruction" ;
+        branchInstr->dump();
+      }
+      branchInstr->eraseFromParent();
+      SHOWNEWBLK(blk1, "Removing redundant successor");
+      srcBlk->removeSuccessor(blk1);
+    }
+  }
+} //removeRedundantConditionalBranch
+
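+// Give the function a single exit: create a dummy block ending in RETURN,
+// erase the return instruction in each block of retBlks, and make the dummy
+// block their common successor.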
+template<class PassT>
+void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*,
+                                               DEFAULT_VEC_SLOTS> &retBlks) {
+  BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(dummyExitBlk);  //insert to function
+  CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep);
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter =
+         retBlks.begin(),
+       iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
+    BlockT *curBlk = *iter;
+    InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
+    if (curInstr) {
+      curInstr->eraseFromParent();
+    }
+#if 0
+    if (curBlk->size()==0 && curBlk->pred_size() == 1) {
+      if (DEBUGME) {
+        errs() << "Replace empty block BB" <<  curBlk->getNumber()
+          << " with dummyExitBlock\n";
+      }
+      BlockT *predb = *curBlk->pred_begin();
+      predb->removeSuccessor(curBlk);
+      curBlk = predb;
+    } //handle empty curBlk
+#endif
+    curBlk->addSuccessor(dummyExitBlk);
+    if (DEBUGME) {
+      errs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
+             << " successors\n";
+    }
+  } //for
+
+  SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
+} //addDummyExitBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
+  while (srcBlk->succ_size()) {
+    srcBlk->removeSuccessor(*srcBlk->succ_begin());
+  }
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
+  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+
+  if (srcBlkInfo == NULL) {
+    srcBlkInfo = new BlockInfo();
+  }
+
+  srcBlkInfo->sccNum = sccNum;
+}
+
+template<class PassT>
+int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
+  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
+  return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM;
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) {
+  if (DEBUGME) {
+    errs() << "Retiring BB" << srcBlk->getNumber() << "\n";
+  }
+
+  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+
+  if (srcBlkInfo == NULL) {
+    srcBlkInfo = new BlockInfo();
+  }
+
+  srcBlkInfo->isRetired = true;
+  //int i = srcBlk->succ_size();
+  //int j = srcBlk->pred_size();
+  assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0
+         && "can't retire block yet");
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) {
+  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
+  return (srcBlkInfo && srcBlkInfo->isRetired);
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  while (loopRep && loopRep->getHeader() == curBlk) {
+    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
+
+    if (loopLand == NULL)
+      return true;
+
+    BlockT *landBlk = loopLand->landBlk;
+    assert(landBlk);
+    if (!isRetiredBlock(landBlk)) {
+      return true;
+    }
+
+    loopRep = loopRep->getParentLoop();
+  }
+
+  return false;
+} //isActiveLoophead
+
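+// Heuristic: a block with multiple predecessors is worth migrating rather
+// than cloning once it is large enough that cloning it for each extra
+// predecessor would duplicate too many instructions.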
+template<class PassT>
+bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) {
+  const unsigned blockSizeThreshold = 30;
+  const unsigned cloneInstrThreshold = 100;
+
+  bool multiplePreds = blk && (blk->pred_size() > 1);
+
+  if (!multiplePreds)
+    return false;
+
+  unsigned blkSize = blk->size();
+  return ((blkSize > blockSizeThreshold)
+          && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold));
+} //needMigrateBlock
+
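+// Record landBlk as the landing block of loopRep.  If landBlk also has
+// predecessors outside the loop and its exit blocks, interpose a fresh
+// landing block in front of it and redirect the in-path predecessors there.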
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk,
+                                            BlockTSmallerVector &exitBlks,
+                                            std::set<BlockT *> &exitBlkSet) {
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks;  //in exit path blocks
+
+  for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
+       predIterEnd = landBlk->pred_end();
+       predIter != predIterEnd; ++predIter) {
+    BlockT *curBlk = *predIter;
+    if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) {
+      inpathBlks.push_back(curBlk);
+    }
+  } //for
+
+  //if landBlk has predecessors that are not in the given loop,
+  //create a new block
+  BlockT *newLandBlk = landBlk;
+  if (inpathBlks.size() != landBlk->pred_size()) {
+    newLandBlk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(newLandBlk);  //insert to function
+    newLandBlk->addSuccessor(landBlk);
+    for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter =
+         inpathBlks.begin(),
+         iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) {
+      BlockT *curBlk = *iter;
+      CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk);
+      //srcBlk, oldBlk, newBlk
+      curBlk->removeSuccessor(landBlk);
+      curBlk->addSuccessor(newLandBlk);
+    }
+    for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) {
+      if (exitBlks[i] == landBlk) {
+        exitBlks[i] = newLandBlk;
+      }
+    }
+    SHOWNEWBLK(newLandBlk, "NewLandingBlock: ");
+  }
+
+  setLoopLandBlock(loopRep, newLandBlk);
+
+  return newLandBlk;
+} // recordLoopLandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  assert(theEntry->landBlk == NULL);
+
+  if (blk == NULL) {
+    blk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(blk);  //insert to function
+    SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: ");
+  }
+
+  theEntry->landBlk = blk;
+
+  if (DEBUGME) {
+    errs() << "setLoopLandBlock loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  landing-block = BB" << blk->getNumber() << "\n";
+  }
+} // setLoopLandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+
+  theEntry->breakOnRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopBreakOnReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopBreakOnReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->contOnRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopContOnReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopContOnReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->breakInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopBreakInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopBreakInitReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->contInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopContInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopContInitReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep,
+                                                     RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->endbranchInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopEndbranchInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopEndbranchInitReg
+
+template<class PassT>
+typename CFGStructurizer<PassT>::LoopLandInfo *
+CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  return theEntry;
+} // getLoopLandInfo
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  return theEntry ? theEntry->landBlk : NULL;
+} // getLoopLandBlock
+
+
+template<class PassT>
+bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  if (loopRep == NULL)
+    return false;
+
+  BlockT *loopHeader = loopRep->getHeader();
+
+  return curBlk->isSuccessor(loopHeader);
+
+} //hasBackEdge
+
+template<class PassT>
+unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) {
+  return loopRep ? loopRep->getLoopDepth() : 0;
+} //getLoopDepth
+
+template<class PassT>
+int CFGStructurizer<PassT>::countActiveBlock
+(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart,
+ typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) {
+  int count = 0;
+  while (iterStart != iterEnd) {
+    if (!isRetiredBlock(*iterStart)) {
+      ++count;
+    }
+    ++iterStart;
+  }
+
+  return count;
+} //countActiveBlock
+
+// This is a workaround for findNearestCommonDominator not being available
+// for post-dominator trees; a proper fix should go into Dominators.h.
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT*
+CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
+
+  if (postDomTree->dominates(blk1, blk2)) {
+    return blk1;
+  }
+  if (postDomTree->dominates(blk2, blk1)) {
+    return blk2;
+  }
+
+  DomTreeNodeT *node1 = postDomTree->getNode(blk1);
+  DomTreeNodeT *node2 = postDomTree->getNode(blk2);
+
+  // Handle newly cloned node.
+  if (node1 == NULL && blk1->succ_size() == 1) {
+    return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
+  }
+  if (node2 == NULL && blk2->succ_size() == 1) {
+    return findNearestCommonPostDom(blk1, *blk2->succ_begin());
+  }
+
+  if (node1 == NULL || node2 == NULL) {
+    return NULL;
+  }
+
+  node1 = node1->getIDom();
+  while (node1) {
+    if (postDomTree->dominates(node1, node2)) {
+      return node1->getBlock();
+    }
+    node1 = node1->getIDom();
+  }
+
+  return NULL;
+}
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::findNearestCommonPostDom
+(typename std::set<BlockT *> &blks) {
+  BlockT *commonDom;
+  typename std::set<BlockT *>::const_iterator iter = blks.begin();
+  typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
+  for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
+    BlockT *curBlk = *iter;
+    if (curBlk != commonDom) {
+      commonDom = findNearestCommonPostDom(curBlk, commonDom);
+    }
+  }
+
+  if (DEBUGME) {
+    errs() << "Common post dominator for exit blocks is ";
+    if (commonDom) {
+          errs() << "BB" << commonDom->getNumber() << "\n";
+    } else {
+      errs() << "NULL\n";
+    }
+  }
+
+  return commonDom;
+} //findNearestCommonPostDom
+
+} //end namespace llvm
+
+//todo: move-end
+
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer for AMDIL
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm
+{
+class AMDILCFGStructurizer : public MachineFunctionPass
+{
+public:
+  typedef MachineInstr              InstructionType;
+  typedef MachineFunction           FunctionType;
+  typedef MachineBasicBlock         BlockType;
+  typedef MachineLoopInfo           LoopinfoType;
+  typedef MachineDominatorTree      DominatortreeType;
+  typedef MachinePostDominatorTree  PostDominatortreeType;
+  typedef MachineDomTreeNode        DomTreeNodeType;
+  typedef MachineLoop               LoopType;
+
+protected:
+  TargetMachine &TM;
+  const TargetInstrInfo *TII;
+  const AMDILRegisterInfo *TRI;
+
+public:
+  AMDILCFGStructurizer(char &pid, TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+  const TargetInstrInfo *getTargetInstrInfo() const;
+  //bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};   //end of class AMDILCFGStructurizer
+
+//char AMDILCFGStructurizer::ID = 0;
+} //end of namespace llvm
+AMDILCFGStructurizer::AMDILCFGStructurizer(char &pid, TargetMachine &tm
+                                           AMDIL_OPT_LEVEL_DECL)
+  : MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()),
+    TRI(static_cast<const AMDILRegisterInfo *>(tm.getRegisterInfo())) {
+}
+
+const TargetInstrInfo *AMDILCFGStructurizer::getTargetInstrInfo() const {
+  return TII;
+}
+//===----------------------------------------------------------------------===//
+//
+// CFGPrepare
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm
+{
+class AMDILCFGPrepare : public AMDILCFGStructurizer
+{
+public:
+  static char ID;
+
+public:
+  AMDILCFGPrepare(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+
+  virtual const char *getPassName() const;
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};   //end of class AMDILCFGPrepare
+
+char AMDILCFGPrepare::ID = 0;
+} //end of namespace llvm
+
+AMDILCFGPrepare::AMDILCFGPrepare(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+  : AMDILCFGStructurizer(ID, tm  AMDIL_OPT_LEVEL_VAR) 
+{
+}
+const char *AMDILCFGPrepare::getPassName() const {
+  return "AMD IL Control Flow Graph Preparation Pass";
+}
+
+void AMDILCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<MachineFunctionAnalysis>();
+  AU.addRequired<MachineFunctionAnalysis>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// CFGPerform
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm
+{
+class AMDILCFGPerform : public AMDILCFGStructurizer
+{
+public:
+  static char ID;
+
+public:
+  AMDILCFGPerform(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+  virtual const char *getPassName() const;
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};   //end of class AMDILCFGPerform
+
+char AMDILCFGPerform::ID = 0;
+} //end of namespace llvm
+
+AMDILCFGPerform::AMDILCFGPerform(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+: AMDILCFGStructurizer(ID, tm AMDIL_OPT_LEVEL_VAR)
+{
+}
+
+const char *AMDILCFGPerform::getPassName() const {
+  return "AMD IL Control Flow Graph structurizer Pass";
+}
+
+void AMDILCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<MachineFunctionAnalysis>();
+  AU.addRequired<MachineFunctionAnalysis>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructTraits<AMDILCFGStructurizer>
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct
+{
+// This class is tailored to the AMDIL backend.
+template<>
+struct CFGStructTraits<AMDILCFGStructurizer>
+{
+  typedef int RegiT;
+
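+  // The helpers below map a scalar BRANCH_COND opcode onto the corresponding
+  // structured-control-flow opcode (BREAK/IF/CONTINUE variants).  From its
+  // uses here, ExpandCaseToAllScalarReturn is assumed to expand into one case
+  // label per scalar type that returns the matching typed opcode.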
+  static int getBreakNzeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDGPU::BRANCH_COND, AMDGPU::BREAK_LOGICALNZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getBreakZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDGPU::BRANCH_COND, AMDGPU::BREAK_LOGICALZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getBranchNzeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDGPU::BRANCH_COND, AMDGPU::IF_LOGICALNZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getBranchZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDGPU::BRANCH_COND, AMDGPU::IF_LOGICALZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getContinueNzeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDGPU::BRANCH_COND, AMDGPU::CONTINUE_LOGICALNZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getContinueZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDGPU::BRANCH_COND, AMDGPU::CONTINUE_LOGICALZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+// the explicitly represented branch target is the true branch target
+#define getExplicitBranch getTrueBranch
+#define setExplicitBranch setTrueBranch
+
+  static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
+    return instr->getOperand(0).getMBB();
+  }
+
+  static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
+    instr->getOperand(0).setMBB(blk);
+  }
+
+  static MachineBasicBlock *
+  getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
+    assert(blk->succ_size() == 2);
+    MachineBasicBlock *trueBranch = getTrueBranch(instr);
+    MachineBasicBlock::succ_iterator iter = blk->succ_begin();
+    MachineBasicBlock::succ_iterator iterNext = iter;
+    ++iterNext;
+
+    return (*iter == trueBranch) ? *iterNext : *iter;
+  }
+
+  static bool isCondBranch(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+      ExpandCaseToAllScalarTypes(AMDGPU::BRANCH_COND);
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static bool isUncondBranch(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+    case AMDGPU::BRANCH:
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
+    //get DebugLoc from the last MachineBasicBlock instruction with debug info
+    DebugLoc DL;
+    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end();
+         ++iter) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getDebugLoc().isUnknown() == false) {
+        DL = instr->getDebugLoc();
+      }
+    }
+    return DL;
+  }
+
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    MachineInstr *instr = &*iter;
+    if (instr && (isCondBranch(instr) || isUncondBranch(instr))) {
+      return instr;
+    }
+    return NULL;
+  }
+
+  // The correct naming for this is getPossibleLoopendBlockBranchInstr.
+  //
+  // A BB with a backward edge can have move instructions after the branch
+  // instruction.  Such move instructions "belong to" the loop backward edge.
+  //
+  static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) {
+    const AMDILInstrInfo * TII = static_cast<const AMDILInstrInfo *>(
+                                  blk->getParent()->getTarget().getInstrInfo());
+
+    for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(),
+         iterEnd = blk->rend(); iter != iterEnd; ++iter) {
+      // FIXME: Simplify
+      MachineInstr *instr = &*iter;
+      if (instr) {
+        if (isCondBranch(instr) || isUncondBranch(instr)) {
+          return instr;
+        } else if (!TII->isMov(instr->getOpcode())) {
+          break;
+        }
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getReturnInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    if (iter != blk->rend()) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDGPU::RETURN) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getContinueInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    if (iter != blk->rend()) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDGPU::CONTINUE) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) {
+    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end();
+         ++iter) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDGPU::BREAK_LOGICALNZ_i32 ||
+          instr->getOpcode() == AMDGPU::BREAK_LOGICALZ_i32) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static bool isReturnBlock(MachineBasicBlock *blk) {
+    MachineInstr *instr = getReturnInstr(blk);
+    bool isReturn = (blk->succ_size() == 0);
+    if (instr) {
+      assert(isReturn);
+    } else if (isReturn) {
+      if (DEBUGME) {
+        errs() << "BB" << blk->getNumber()
+               <<" is return block without RETURN instr\n";
+      }
+    }
+
+    return isReturn;
+  }
+
+  static MachineBasicBlock::iterator
+  getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
+    assert(instr->getParent() == blk && "instruction doesn't belong to block");
+    MachineBasicBlock::iterator iter = blk->begin();
+    MachineBasicBlock::iterator iterEnd = blk->end();
+    while (iter != iterEnd && &(*iter) != instr) {
+      ++iter;
+    }
+
+    assert(iter != iterEnd);
+    return iter;
+  }//getInstrPos
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
+                                         AMDILCFGStructurizer *passRep) {
+    return insertInstrBefore(blk, newOpcode, passRep, DebugLoc());
+  } //insertInstrBefore
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
+                                         AMDILCFGStructurizer *passRep, DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    MachineBasicBlock::iterator res;
+    if (blk->begin() != blk->end()) {
+      blk->insert(blk->begin(), newInstr);
+    } else {
+      blk->push_back(newInstr);
+    }
+
+    SHOWNEWINSTR(newInstr);
+
+    return newInstr;
+  } //insertInstrBefore
+
+  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
+                             AMDILCFGStructurizer *passRep) {
+    insertInstrEnd(blk, newOpcode, passRep, DebugLoc());
+  } //insertInstrEnd
+
+  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
+                             AMDILCFGStructurizer *passRep, DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr = blk->getParent()
+      ->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    blk->push_back(newInstr);
+    //assume the instruction doesn't take any reg operand ...
+
+    SHOWNEWINSTR(newInstr);
+  } //insertInstrEnd
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
+                                         int newOpcode, 
+                                         AMDILCFGStructurizer *passRep) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
+                                           DebugLoc());
+
+    blk->insert(instrPos, newInstr);
+    //assume the instruction doesn't take any reg operand ...
+
+    SHOWNEWINSTR(newInstr);
+    return newInstr;
+  } //insertInstrBefore
+
+  static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
+                                     int newOpcode,
+                                     AMDILCFGStructurizer *passRep,
+                                     DebugLoc DL) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
+                                           DL);
+
+    blk->insert(instrPos, newInstr);
+    MachineInstrBuilder(newInstr).addReg(oldInstr->getOperand(1).getReg(),
+                                         false);
+
+    SHOWNEWINSTR(newInstr);
+    // oldInstr->eraseFromParent() is deferred; the caller erases it later.
+  } //insertCondBranchBefore
+
+  static void insertCondBranchBefore(MachineBasicBlock *blk,
+                                     MachineBasicBlock::iterator insertPos,
+                                     int newOpcode,
+                                     AMDILCFGStructurizer *passRep,
+                                     RegiT regNum,
+                                     DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    //insert before
+    blk->insert(insertPos, newInstr);
+    MachineInstrBuilder(newInstr).addReg(regNum, false);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertCondBranchBefore
+
+  static void insertCondBranchEnd(MachineBasicBlock *blk,
+                                  int newOpcode,
+                                  AMDILCFGStructurizer *passRep,
+                                  RegiT regNum) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
+
+    blk->push_back(newInstr);
+    MachineInstrBuilder(newInstr).addReg(regNum, false);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertCondBranchEnd
+
+
+  static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
+                                      AMDILCFGStructurizer *passRep,
+                                      RegiT regNum, int regVal) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const AMDILInstrInfo *tii =
+             static_cast<const AMDILInstrInfo *>(passRep->getTargetInstrInfo());
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
+                                                 regVal);
+    blk->insert(instrPos, newInstr);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertAssignInstrBefore
+
+  static void insertAssignInstrBefore(MachineBasicBlock *blk,
+                                      AMDILCFGStructurizer *passRep,
+                                      RegiT regNum, int regVal) {
+    const AMDILInstrInfo *tii =
+             static_cast<const AMDILInstrInfo *>(passRep->getTargetInstrInfo());
+
+    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
+                                                 regVal);
+    if (blk->begin() != blk->end()) {
+      blk->insert(blk->begin(), newInstr);
+    } else {
+      blk->push_back(newInstr);
+    }
+
+    SHOWNEWINSTR(newInstr);
+  } //insertAssignInstrBefore
+
+  static void insertCompareInstrBefore(MachineBasicBlock *blk,
+                                       MachineBasicBlock::iterator instrPos,
+                                       AMDILCFGStructurizer *passRep,
+                                       RegiT dstReg, RegiT src1Reg,
+                                       RegiT src2Reg) {
+    const AMDILInstrInfo *tii =
+             static_cast<const AMDILInstrInfo *>(passRep->getTargetInstrInfo());
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc());
+
+    MachineInstrBuilder(newInstr).addReg(dstReg, RegState::Define); //set target
+    MachineInstrBuilder(newInstr).addReg(src1Reg); //set src value
+    MachineInstrBuilder(newInstr).addReg(src2Reg); //set src value
+
+    blk->insert(instrPos, newInstr);
+    SHOWNEWINSTR(newInstr);
+
+  } //insertCompareInstrBefore
+
+  static void cloneSuccessorList(MachineBasicBlock *dstBlk,
+                                 MachineBasicBlock *srcBlk) {
+    for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
+         iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
+      dstBlk->addSuccessor(*iter);  // addSuccessor also updates *iter's
+                                    // predecessor list
+    }
+  } //cloneSuccessorList
+
+  static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
+    MachineFunction *func = srcBlk->getParent();
+    MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
+    func->push_back(newBlk);  //insert to function
+    //newBlk->setNumber(srcBlk->getNumber());
+    for (MachineBasicBlock::iterator iter = srcBlk->begin(),
+         iterEnd = srcBlk->end();
+         iter != iterEnd; ++iter) {
+      MachineInstr *instr = func->CloneMachineInstr(iter);
+      newBlk->push_back(instr);
+    }
+    return newBlk;
+  }
+
+  // MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose here
+  // because the AMDIL branch instruction is not recognized as a terminator.
+  // FIXME: fix that and retire this routine.
+  static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
+                                         MachineBasicBlock *oldBlk,
+                                         MachineBasicBlock *newBlk) {
+    MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
+    if (branchInstr && isCondBranch(branchInstr) &&
+        getExplicitBranch(branchInstr) == oldBlk) {
+      setExplicitBranch(branchInstr, newBlk);
+    }
+  }
+
+  static void wrapup(MachineBasicBlock *entryBlk) {
+    assert((!entryBlk->getParent()->getJumpTableInfo()
+            || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
+           && "found a jump table");
+
+    //collect continue right before endloop
+    SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
+    MachineBasicBlock::iterator pre = entryBlk->begin();
+    MachineBasicBlock::iterator iterEnd = entryBlk->end();
+    MachineBasicBlock::iterator iter = pre;
+    while (iter != iterEnd) {
+      if (pre->getOpcode() == AMDGPU::CONTINUE
+          && iter->getOpcode() == AMDGPU::ENDLOOP) {
+        contInstr.push_back(pre);
+      }
+      pre = iter;
+      ++iter;
+    } //end while
+
+    //delete continue right before endloop
+    for (unsigned i = 0; i < contInstr.size(); ++i) {
+      contInstr[i]->eraseFromParent();
+    }
+
+    // TODO: fix up the jump table so a later phase won't be confused.  If
+    // jumpTableInfo->isEmpty() is false we need to clean the jump table, but
+    // there isn't such an interface yet; alternatively, replace all the other
+    // blocks in the jump table with the entryBlk.
+
+  } //wrapup
+
+  static MachineDominatorTree *getDominatorTree(AMDILCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachineDominatorTree>();
+  }
+
+  static MachinePostDominatorTree*
+  getPostDominatorTree(AMDILCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachinePostDominatorTree>();
+  }
+
+  static MachineLoopInfo *getLoopInfo(AMDILCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachineLoopInfo>();
+  }
+}; // template class CFGStructTraits
+} //end of namespace llvm
+
+// createAMDILCFGPreparationPass - Returns a pass
+FunctionPass *llvm::createAMDILCFGPreparationPass(TargetMachine &tm
+                                                  AMDIL_OPT_LEVEL_DECL) {
+  return new AMDILCFGPrepare(tm  AMDIL_OPT_LEVEL_VAR);
+}
+
+bool AMDILCFGPrepare::runOnMachineFunction(MachineFunction &func) {
+  return llvmCFGStruct::CFGStructurizer<AMDILCFGStructurizer>().prepare(func,
+                                                                        *this,
+                                                                        TRI);
+}
+
+// createAMDILCFGStructurizerPass - Returns a pass
+FunctionPass *llvm::createAMDILCFGStructurizerPass(TargetMachine &tm
+                                                   AMDIL_OPT_LEVEL_DECL) {
+  return new AMDILCFGPerform(tm  AMDIL_OPT_LEVEL_VAR);
+}
+
+bool AMDILCFGPerform::runOnMachineFunction(MachineFunction &func) {
+  return llvmCFGStruct::CFGStructurizer<AMDILCFGStructurizer>().run(func,
+                                                                    *this,
+                                                                    TRI);
+}
+
+//end of file newline goes below
+
diff --git a/lib/Target/AMDGPU/AMDILCallingConv.td b/lib/Target/AMDGPU/AMDILCallingConv.td
new file mode 100644 (file)
index 0000000..371d02a
--- /dev/null
@@ -0,0 +1,42 @@
+//===- AMDILCallingConv.td - Calling Conventions AMDIL -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the AMDIL architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// AMDIL 32-bit C return-value convention.
+def RetCC_AMDIL32 : CallingConv<[
+  // Since IL has no return values, all values can be emulated on the stack.
+  // The stack can then be mapped to a number of sequential virtual registers
+  // in IL.
+
+  // Integer and FP scalar values get put on the stack at 16-byte alignment
+  // but with a size of 4 bytes.
+  CCIfType<[i32, f32], CCAssignToReg<
+    [R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16,
+     R17, R18, R19, R20]> >,
+  CCAssignToStack<16, 16>]>;
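+// Note: CallingConv actions apply in order, so a value that is not assigned
+// one of R1-R20 falls through to CCAssignToStack<16, 16>, i.e. a 16-byte
+// slot with 16-byte alignment.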
+
+// AMDIL 32-bit C Calling convention.
+def CC_AMDIL32 : CallingConv<[
+  // IL parameter values are emulated on the stack.  The stack can then be
+  // mapped to a number of sequential virtual registers in IL.
+
+  // Integer and FP scalar values get put on the stack at 16-byte alignment
+  // but with a size of 4 bytes.
+  CCIfType<[i32, f32], CCAssignToReg<
+    [R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16,
+     R17, R18, R19, R20]> >,
+  CCAssignToStack<16, 16>]>;
diff --git a/lib/Target/AMDGPU/AMDILCodeEmitter.h b/lib/Target/AMDGPU/AMDILCodeEmitter.h
new file mode 100644 (file)
index 0000000..0c7ae59
--- /dev/null
@@ -0,0 +1,48 @@
+//===-- AMDILCodeEmitter.h - AMDIL Code Emitter interface -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CodeEmitter interface for R600 and SI codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDILCODEEMITTER_H
+#define AMDILCODEEMITTER_H
+
+namespace llvm {
+
+  class AMDILCodeEmitter {
+  public:
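+    // Assumed (per the usual LLVM pattern) to be generated by TableGen's
+    // CodeEmitterGen from the instruction definitions; it encodes each
+    // operand through the virtual hooks declared below.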
+    uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+    virtual uint64_t getMachineOpValue(const MachineInstr &MI,
+                                   const MachineOperand &MO) const { return 0; }
+    virtual unsigned GPR4AlignEncode(const MachineInstr  &MI,
+                                     unsigned OpNo) const {
+      return 0;
+    }
+    virtual unsigned GPR2AlignEncode(const MachineInstr &MI,
+                                     unsigned OpNo) const {
+      return 0;
+    }
+    virtual uint64_t VOPPostEncode(const MachineInstr &MI,
+                                   uint64_t Value) const {
+      return Value;
+    }
+    virtual uint64_t i32LiteralEncode(const MachineInstr &MI,
+                                      unsigned OpNo) const {
+      return 0;
+    }
+    virtual uint32_t SMRDmemriEncode(const MachineInstr &MI,
+                                     unsigned OpNo) const {
+      return 0;
+    }
+  };
+
+} // End namespace llvm
+
+#endif // AMDILCODEEMITTER_H
diff --git a/lib/Target/AMDGPU/AMDILDevice.cpp b/lib/Target/AMDGPU/AMDILDevice.cpp
new file mode 100644 (file)
index 0000000..4294a8b
--- /dev/null
@@ -0,0 +1,137 @@
+//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILDevice.h"
+#include "AMDILSubtarget.h"
+
+using namespace llvm;
+// Default implementation for all of the classes.
+AMDILDevice::AMDILDevice(AMDILSubtarget *ST) : mSTM(ST)
+{
+  mHWBits.resize(AMDILDeviceInfo::MaxNumberCapabilities);
+  mSWBits.resize(AMDILDeviceInfo::MaxNumberCapabilities);
+  setCaps();
+  mDeviceFlag = OCL_DEVICE_ALL;
+}
+
+AMDILDevice::~AMDILDevice()
+{
+  mHWBits.clear();
+  mSWBits.clear();
+}
+
+size_t AMDILDevice::getMaxGDSSize() const
+{
+  return 0;
+}
+
+uint32_t AMDILDevice::getDeviceFlag() const
+{
+  return mDeviceFlag;
+}
+
+size_t AMDILDevice::getMaxNumCBs() const
+{
+  if (usesHardware(AMDILDeviceInfo::ConstantMem)) {
+    return HW_MAX_NUM_CB;
+  }
+
+  return 0;
+}
+
+size_t AMDILDevice::getMaxCBSize() const
+{
+  if (usesHardware(AMDILDeviceInfo::ConstantMem)) {
+    return MAX_CB_SIZE;
+  }
+
+  return 0;
+}
+
+size_t AMDILDevice::getMaxScratchSize() const
+{
+  return 65536;
+}
+
+uint32_t AMDILDevice::getStackAlignment() const
+{
+  return 16;
+}
+
+void AMDILDevice::setCaps()
+{
+  mSWBits.set(AMDILDeviceInfo::HalfOps);
+  mSWBits.set(AMDILDeviceInfo::ByteOps);
+  mSWBits.set(AMDILDeviceInfo::ShortOps);
+  mSWBits.set(AMDILDeviceInfo::HW64BitDivMod);
+  if (mSTM->isOverride(AMDILDeviceInfo::NoInline)) {
+    mSWBits.set(AMDILDeviceInfo::NoInline);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::MacroDB)) {
+    mSWBits.set(AMDILDeviceInfo::MacroDB);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::Debug)) {
+    mSWBits.set(AMDILDeviceInfo::ConstantMem);
+  } else {
+    mHWBits.set(AMDILDeviceInfo::ConstantMem);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::Debug)) {
+    mSWBits.set(AMDILDeviceInfo::PrivateMem);
+  } else {
+    mHWBits.set(AMDILDeviceInfo::PrivateMem);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::BarrierDetect)) {
+    mSWBits.set(AMDILDeviceInfo::BarrierDetect);
+  }
+  mSWBits.set(AMDILDeviceInfo::ByteLDSOps);
+  mSWBits.set(AMDILDeviceInfo::LongOps);
+}
+
+AMDILDeviceInfo::ExecutionMode
+AMDILDevice::getExecutionMode(AMDILDeviceInfo::Caps Caps) const
+{
+  if (mHWBits[Caps]) {
+    assert(!mSWBits[Caps] && "Cannot set both SW and HW caps");
+    return AMDILDeviceInfo::Hardware;
+  }
+
+  if (mSWBits[Caps]) {
+    assert(!mHWBits[Caps] && "Cannot set both SW and HW caps");
+    return AMDILDeviceInfo::Software;
+  }
+
+  return AMDILDeviceInfo::Unsupported;
+}
+
+bool AMDILDevice::isSupported(AMDILDeviceInfo::Caps Mode) const
+{
+  return getExecutionMode(Mode) != AMDILDeviceInfo::Unsupported;
+}
+
+bool AMDILDevice::usesHardware(AMDILDeviceInfo::Caps Mode) const
+{
+  return getExecutionMode(Mode) == AMDILDeviceInfo::Hardware;
+}
+
+bool AMDILDevice::usesSoftware(AMDILDeviceInfo::Caps Mode) const
+{
+  return getExecutionMode(Mode) == AMDILDeviceInfo::Software;
+}
+
+std::string
+AMDILDevice::getDataLayout() const
+{
+    return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
+      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
+      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
+      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
+      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+      "-n8:16:32:64");
+}
diff --git a/lib/Target/AMDGPU/AMDILDevice.h b/lib/Target/AMDGPU/AMDILDevice.h
new file mode 100644 (file)
index 0000000..706dd82
--- /dev/null
@@ -0,0 +1,116 @@
+//===---- AMDILDevice.h - Define Device Data for AMDIL -----*- C++ -*------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===----------------------------------------------------------------------===//
+// This file will define the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
+// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef _AMDILDEVICEIMPL_H_
+#define _AMDILDEVICEIMPL_H_
+#include "AMDIL.h"
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+  class AMDILSubtarget;
+  class MCStreamer;
+//===----------------------------------------------------------------------===//
+// Interface for data that is specific to a single device
+//===----------------------------------------------------------------------===//
+class AMDILDevice {
+public:
+  AMDILDevice(AMDILSubtarget *ST);
+  virtual ~AMDILDevice();
+
+  // Enum values for the various memory types.
+  enum {
+    RAW_UAV_ID   = 0,
+    ARENA_UAV_ID = 1,
+    LDS_ID       = 2,
+    GDS_ID       = 3,
+    SCRATCH_ID   = 4,
+    CONSTANT_ID  = 5,
+    GLOBAL_ID    = 6,
+    MAX_IDS      = 7
+  } IO_TYPE_IDS;
+
+  // Returns the max LDS size that the hardware supports.  Size is in
+  // bytes.
+  virtual size_t getMaxLDSSize() const = 0;
+
+  // Returns the max GDS size that the hardware supports if the GDS is
+  // supported by the hardware.  Size is in bytes.
+  virtual size_t getMaxGDSSize() const;
+
+  // Returns the max number of hardware constant address spaces that
+  // are supported by this device.
+  virtual size_t getMaxNumCBs() const;
+
+  // Returns the max number of bytes a single hardware constant buffer
+  // can support.  Size is in bytes.
+  virtual size_t getMaxCBSize() const;
+
+  // Returns the max number of bytes allowed by the hardware scratch
+  // buffer.  Size is in bytes.
+  virtual size_t getMaxScratchSize() const;
+
+  // Get the flag that corresponds to the device.
+  virtual uint32_t getDeviceFlag() const;
+
+  // Returns the number of work-items that exist in a single hardware
+  // wavefront.
+  virtual size_t getWavefrontSize() const = 0;
+
+  // Get the generational name of this specific device.
+  virtual uint32_t getGeneration() const = 0;
+
+  // Get the stack alignment of this specific device.
+  virtual uint32_t getStackAlignment() const;
+
+  // Get the resource ID for this specific device.
+  virtual uint32_t getResourceID(uint32_t DeviceID) const = 0;
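+  // For example (a sketch): getResourceID(LDS_ID) would return the id this
+  // device uses for local (LDS) memory operations.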
+
+  // Get the max number of UAV's for this device.
+  virtual uint32_t getMaxNumUAVs() const = 0;
+
+
+  // API exposing the more detailed capabilities of each family of cards.
+  // If a capability is supported, then exactly one of usesHardware or
+  // usesSoftware returns true; if usesHardware returns true, then
+  // usesSoftware must return false for the same capability.  Hardware
+  // execution means that the feature is done natively by the hardware and is
+  // not emulated in software.  Software execution means that the feature is
+  // emulated in software, possibly using the hardware for support, since the
+  // hardware does not fully comply with OpenCL specs.
+  bool isSupported(AMDILDeviceInfo::Caps Mode) const;
+  bool usesHardware(AMDILDeviceInfo::Caps Mode) const;
+  bool usesSoftware(AMDILDeviceInfo::Caps Mode) const;
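+  // Minimal usage sketch (illustrative; "Dev" is a hypothetical
+  // AMDILDevice pointer):
+  //   if (Dev->usesHardware(AMDILDeviceInfo::ConstantMem))
+  //     ; // emit native constant-buffer accesses
+  //   else if (Dev->usesSoftware(AMDILDeviceInfo::ConstantMem))
+  //     ; // take the software-emulation path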
+  virtual std::string getDataLayout() const;
+  static const unsigned int MAX_LDS_SIZE_700 = 16384;
+  static const unsigned int MAX_LDS_SIZE_800 = 32768;
+  static const unsigned int WavefrontSize = 64;
+  static const unsigned int HalfWavefrontSize = 32;
+  static const unsigned int QuarterWavefrontSize = 16;
+protected:
+  virtual void setCaps();
+  llvm::BitVector mHWBits;
+  llvm::BitVector mSWBits;
+  AMDILSubtarget *mSTM;
+  uint32_t mDeviceFlag;
+private:
+  AMDILDeviceInfo::ExecutionMode
+  getExecutionMode(AMDILDeviceInfo::Caps Caps) const;
+}; // AMDILDevice
+
+} // namespace llvm
+#endif // _AMDILDEVICEIMPL_H_
diff --git a/lib/Target/AMDGPU/AMDILDeviceInfo.cpp b/lib/Target/AMDGPU/AMDILDeviceInfo.cpp
new file mode 100644 (file)
index 0000000..cbf5b51
--- /dev/null
@@ -0,0 +1,93 @@
+//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Function that creates DeviceInfo from a device name and other information.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILDevices.h"
+#include "AMDILSubtarget.h"
+
+using namespace llvm;
+namespace llvm {
+namespace AMDILDeviceInfo {
+AMDILDevice *getDeviceFromName(const std::string &deviceName,
+                               AMDILSubtarget *ptr, bool is64bit,
+                               bool is64on32bit)
+{
+    if (deviceName.c_str()[2] == '7') {
+        switch (deviceName.c_str()[3]) {
+            case '1':
+                return new AMDIL710Device(ptr);
+            case '7':
+                return new AMDIL770Device(ptr);
+            default:
+                return new AMDIL7XXDevice(ptr);
+        };
+    } else if (deviceName == "cypress") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILCypressDevice(ptr);
+    } else if (deviceName == "juniper") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILEvergreenDevice(ptr);
+    } else if (deviceName == "redwood") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+      return new AMDILRedwoodDevice(ptr);
+    } else if (deviceName == "cedar") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILCedarDevice(ptr);
+    } else if (deviceName == "barts"
+      || deviceName == "turks") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILNIDevice(ptr);
+    } else if (deviceName == "cayman") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILCaymanDevice(ptr);
+    } else if (deviceName == "caicos") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILNIDevice(ptr);
+    } else if (deviceName == "SI") {
+        return new AMDILSIDevice(ptr);
+    } else {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDIL7XXDevice(ptr);
+    }
+}
+} // End namespace AMDILDeviceInfo
+} // End namespace llvm
diff --git a/lib/Target/AMDGPU/AMDILDeviceInfo.h b/lib/Target/AMDGPU/AMDILDeviceInfo.h
new file mode 100644 (file)
index 0000000..06ac432
--- /dev/null
@@ -0,0 +1,89 @@
+//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#ifndef _AMDILDEVICEINFO_H_
+#define _AMDILDEVICEINFO_H_
+
+
+#include <string>
+
+namespace llvm
+{
+  class AMDILDevice;
+  class AMDILSubtarget;
+  namespace AMDILDeviceInfo
+  {
+    // Each capability can be executed using a hardware instruction,
+    // emulated with a sequence of software instructions, or not
+    // supported at all.
+    enum ExecutionMode {
+      Unsupported = 0, // Unsupported feature on the card (default value).
+      Software,        // The feature is emulated in software.
+      Hardware         // The feature exists natively in hardware.
+    };
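+    // Illustrative sketch (hypothetical device): a DoubleOps query yields
+    // Hardware on a device with native f64 ALUs, Software where the backend
+    // expands f64 arithmetic, and Unsupported otherwise.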
+
+    // Any change to this needs a corresponding update to the
+    // GPUMetadataABI twiki page.
+    enum Caps {
+      HalfOps          = 0x1,  // Half float is supported or not.
+      DoubleOps        = 0x2,  // Double is supported or not.
+      ByteOps          = 0x3,  // Byte (char) is supported or not.
+      ShortOps         = 0x4,  // Short is supported or not.
+      LongOps          = 0x5,  // Long is supported or not.
+      Images           = 0x6,  // Images are supported or not.
+      ByteStores       = 0x7,  // ByteStores available(!HD4XXX).
+      ConstantMem      = 0x8,  // Constant/CB memory.
+      LocalMem         = 0x9,  // Local/LDS memory.
+      PrivateMem       = 0xA,  // Scratch/Private/Stack memory.
+      RegionMem        = 0xB,  // OCL GDS Memory Extension.
+      FMA              = 0xC,  // Use HW FMA or SW FMA.
+      ArenaSegment     = 0xD,  // Use for Arena UAV per pointer 12-1023.
+      MultiUAV         = 0xE,  // Use for UAV per Pointer 0-7.
+      Reserved0        = 0xF,  // ReservedFlag
+      NoAlias          = 0x10, // Cached loads.
+      Signed24BitOps   = 0x11, // Peephole Optimization.
+      // Debug mode implies that no hardware features or optimizations
+      // are performed and that all memory accesses go through a single
+      // uav (Arena on HD5XXX/HD6XXX and Raw on HD4XXX).
+      Debug            = 0x12, // Debug mode is enabled.
+      CachedMem        = 0x13, // Cached mem is available or not.
+      BarrierDetect    = 0x14, // Detect duplicate barriers.
+      Reserved1        = 0x15, // Reserved flag
+      ByteLDSOps       = 0x16, // Flag to specify if byte LDS ops are available.
+      ArenaVectors     = 0x17, // Flag to specify if vector loads from arena work.
+      TmrReg           = 0x18, // Flag to specify if Tmr register is supported.
+      NoInline         = 0x19, // Flag to specify that no inlining should occur.
+      MacroDB          = 0x1A, // Flag to specify that backend handles macrodb.
+      HW64BitDivMod    = 0x1B, // Flag for backend to generate 64bit div/mod.
+      ArenaUAV         = 0x1C, // Flag to specify that arena uav is supported.
+      PrivateUAV       = 0x1D, // Flag to specify that private memory uses uav's.
+      // If more capabilities are required, then
+      // this number needs to be increased.
+      // All capabilities must come before this
+      // number.
+      MaxNumberCapabilities = 0x20
+    };
+    // These have to be in order with the older generations
+    // having the lower number enumerations.
+    enum Generation {
+      HD4XXX = 0, // 7XX based devices.
+      HD5XXX, // Evergreen based devices.
+      HD6XXX, // NI/Evergreen+ based devices.
+      HD7XXX,
+      HDTEST, // Experimental feature testing device.
+      HDNUMGEN
+    };
+
+
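+    // Typical use (a sketch; "ST" is a hypothetical AMDILSubtarget*):
+    //   AMDILDevice *Dev = AMDILDeviceInfo::getDeviceFromName("redwood", ST);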
+    AMDILDevice *getDeviceFromName(const std::string &name,
+                                   AMDILSubtarget *ptr,
+                                   bool is64bit = false,
+                                   bool is64on32bit = false);
+  } // namespace AMDILDeviceInfo
+} // namespace llvm
+#endif // _AMDILDEVICEINFO_H_
diff --git a/lib/Target/AMDGPU/AMDILDevices.h b/lib/Target/AMDGPU/AMDILDevices.h
new file mode 100644 (file)
index 0000000..cfcc330
--- /dev/null
@@ -0,0 +1,19 @@
+//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#ifndef __AMDIL_DEVICES_H_
+#define __AMDIL_DEVICES_H_
+// Include all of the device specific header files
+// This file is for Internal use only!
+#include "AMDIL7XXDevice.h"
+#include "AMDILDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILNIDevice.h"
+#include "AMDILSIDevice.h"
+
+#endif // __AMDIL_DEVICES_H_
diff --git a/lib/Target/AMDGPU/AMDILEnumeratedTypes.td b/lib/Target/AMDGPU/AMDILEnumeratedTypes.td
new file mode 100644 (file)
index 0000000..f10936b
--- /dev/null
@@ -0,0 +1,522 @@
+//===-- AMDILEnumeratedTypes.td - IL Type definitions --*- tablegen -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// ILEnumeratedTypes.td - The IL Enumerated Types
+//===--------------------------------------------------------------------===//
+
+// Section 5.1  IL Shader
+class ILShader<bits<8> val> {
+    bits<8> Value = val;
+}
+// Table 5-1
+def IL_SHADER_PIXEL : ILShader<0>;
+def IL_SHADER_COMPUTE : ILShader<1>;
+
+// Section 5.2 IL RegType
+class ILRegType<bits<6> val> {
+    bits<6> Value = val;
+}
+// Table 5-2
+def IL_REGTYPE_TEMP      : ILRegType<0>;
+def IL_REGTYPE_WINCOORD  : ILRegType<1>;
+def IL_REGTYPE_CONST_BUF : ILRegType<2>;
+def IL_REGTYPE_LITERAL   : ILRegType<3>;
+def IL_REGTYPE_ITEMP     : ILRegType<4>;
+def IL_REGTYPE_GLOBAL    : ILRegType<5>;
+
+// Section 5.3 IL Component Select
+class ILComponentSelect<bits<3> val, string text> {
+    bits<3> Value = val;
+    string Text = text;
+}
+// Table 5-3
+def IL_COMPSEL_X : ILComponentSelect<0, "x">;
+def IL_COMPSEL_Y : ILComponentSelect<1, "y">;
+def IL_COMPSEL_Z : ILComponentSelect<2, "z">;
+def IL_COMPSEL_W : ILComponentSelect<3, "w">;
+def IL_COMPSEL_0 : ILComponentSelect<4, "0">;
+def IL_COMPSEL_1 : ILComponentSelect<5, "1">;
+
+// Section 5.4 IL Mod Dst Comp
+class ILModDstComp<bits<2> val, string text> {
+    bits<2> Value = val;
+    string Text = text;
+}
+// Table 5-4
+def IL_MODCOMP_NOWRITE : ILModDstComp<0, "_">;
+def IL_MODCOMP_WRITE_X : ILModDstComp<1, "x">;
+def IL_MODCOMP_WRITE_y : ILModDstComp<1, "y">;
+def IL_MODCOMP_WRITE_z : ILModDstComp<1, "z">;
+def IL_MODCOMP_WRITE_w : ILModDstComp<1, "w">;
+def IL_MODCOMP_0       : ILModDstComp<2, "0">;
+def IL_MODCOMP_1       : ILModDstComp<3, "1">;
+
+// Section 5.5 IL Import Usage
+class ILImportUsage<bits<1> val, string usage> {
+    bits<1> Value = val;
+    string Text = usage;
+}
+// Table 5-5
+def IL_IMPORTUSAGE_WINCOORD : ILImportUsage<0, "_usage(wincoord)">;
+
+// Section 5.6 IL Shift Scale
+class ILShiftScale<bits<4> val, string scale> {
+    bits<4> Value = val;
+    string Text = scale;
+}
+
+// Table 5-6
+def IL_SHIFT_NONE   : ILShiftScale<0, "">;
+def IL_SHIFT_X2     : ILShiftScale<1, "_x2">;
+def IL_SHIFT_X4     : ILShiftScale<2, "_x4">;
+def IL_SHIFT_X8     : ILShiftScale<3, "_x8">;
+def IL_SHIFT_D2     : ILShiftScale<4, "_d2">;
+def IL_SHIFT_D4     : ILShiftScale<5, "_d4">;
+def IL_SHIFT_D8     : ILShiftScale<6, "_d8">;
+
+// Section 5.7 IL Divide Component
+class ILDivComp<bits<3> val, string divcomp> {
+    bits<3> Value = val;
+    string Text = divcomp;
+}
+
+// Table 5-7
+def IL_DIVCOMP_NONE : ILDivComp<0, "_divcomp(none)">;
+def IL_DIVCOMP_Y    : ILDivComp<1, "_divcomp(y)">;
+def IL_DIVCOMP_Z    : ILDivComp<2, "_divcomp(z)">;
+def IL_DIVCOMP_W    : ILDivComp<3, "_divcomp(w)">;
+//def IL_DIVCOMP_UNKNOWN : ILDivComp<4, "_divcomp(unknown)">;
+
+// Section 5.8 IL Relational Op
+class ILRelOp<bits<3> val, string op> {
+    bits<3> Value = val;
+    string Text = op;
+}
+
+// Table 5-8
+def IL_RELOP_EQ : ILRelOp<0, "_relop(eq)">;
+def IL_RELOP_NE : ILRelOp<1, "_relop(ne)">;
+def IL_RELOP_GT : ILRelOp<2, "_relop(gt)">;
+def IL_RELOP_GE : ILRelOp<3, "_relop(ge)">;
+def IL_RELOP_LT : ILRelOp<4, "_relop(lt)">;
+def IL_RELOP_LE : ILRelOp<5, "_relop(le)">;
+
+// Section 5.9 IL Zero Op
+class ILZeroOp<bits<3> val, string behavior> {
+    bits<3> Value = val;
+    string Text = behavior;
+}
+
+// Table 5-9
+def IL_ZEROOP_FLTMAX    : ILZeroOp<0, "_zeroop(fltmax)">;
+def IL_ZEROOP_0         : ILZeroOp<1, "_zeroop(zero)">;
+def IL_ZEROOP_INFINITY  : ILZeroOp<2, "_zeroop(infinity)">;
+def IL_ZEROOP_INF_ELSE_MAX : ILZeroOp<3, "_zeroop(inf_else_max)">;
+
+// Section 5.10 IL Cmp Value
+class ILCmpValue<bits<3> val, string num> {
+    bits<3> Value = val;
+    string Text = num;
+}
+
+// Table 5-10
+def IL_CMPVAL_0_0     : ILCmpValue<0, "0.0">;
+def IL_CMPVAL_0_5     : ILCmpValue<1, "0.5">;
+def IL_CMPVAL_1_0     : ILCmpValue<2, "1.0">;
+def IL_CMPVAL_NEG_0_5 : ILCmpValue<3, "-0.5">;
+def IL_CMPVAL_NEG_1_0 : ILCmpValue<4, "-1.0">;
+
+// Section 5.11 IL Addressing
+class ILAddressing<bits<3> val> {
+    bits<3> Value = val;
+}
+
+// Table 5-11
+def IL_ADDR_ABSOLUTE     : ILAddressing<0>;
+def IL_ADDR_RELATIVE     : ILAddressing<1>;
+def IL_ADDR_REG_RELATIVE : ILAddressing<2>;
+
+// Section 5.11 IL Element Format
+class ILElementFormat<bits<5> val> {
+    bits<5> Value = val;
+}
+
+// Table 5-11
+def IL_ELEMENTFORMAT_UNKNOWN : ILElementFormat<0>;
+def IL_ELEMENTFORMAT_SNORM   : ILElementFormat<1>;
+def IL_ELEMENTFORMAT_UNORM   : ILElementFormat<2>;
+def IL_ELEMENTFORMAT_SINT    : ILElementFormat<3>;
+def IL_ELEMENTFORMAT_UINT    : ILElementFormat<4>;
+def IL_ELEMENTFORMAT_FLOAT   : ILElementFormat<5>;
+def IL_ELEMENTFORMAT_SRGB    : ILElementFormat<6>;
+def IL_ELEMENTFORMAT_MIXED   : ILElementFormat<7>;
+def IL_ELEMENTFORMAT_Last    : ILElementFormat<8>;
+
+// Section 5.12 IL Op Code
+class ILOpCode<bits<16> val = -1, string cmd> {
+    bits<16> Value = val;
+    string Text = cmd;
+}
+
+// Table 5-12
+def IL_DCL_CONST_BUFFER         : ILOpCode<0, "dcl_cb">;
+def IL_DCL_INDEXED_TEMP_ARRAY   : ILOpCode<1, "dcl_index_temp_array">;
+def IL_DCL_INPUT                : ILOpCode<2, "dcl_input">;
+def IL_DCL_LITERAL              : ILOpCode<3, "dcl_literal">;
+def IL_DCL_OUTPUT               : ILOpCode<4, "dcl_output">;
+def IL_DCL_RESOURCE             : ILOpCode<5, "dcl_resource">;
+def IL_OP_ABS                   : ILOpCode<6, "abs">;
+def IL_OP_ADD                   : ILOpCode<7, "add">;
+def IL_OP_AND                   : ILOpCode<8, "iand">;
+def IL_OP_BREAK                 : ILOpCode<9, "break">;
+def IL_OP_BREAK_LOGICALNZ       : ILOpCode<10, "break_logicalnz">;
+def IL_OP_BREAK_LOGICALZ        : ILOpCode<11, "break_logicalz">;
+def IL_OP_BREAKC                : ILOpCode<12, "breakc">;
+def IL_OP_CALL                  : ILOpCode<13, "call">;
+def IL_OP_CALL_LOGICALNZ        : ILOpCode<14, "call_logicalnz">;
+def IL_OP_CALL_LOGICALZ         : ILOpCode<15, "call_logicalz">;
+def IL_OP_CASE                  : ILOpCode<16, "case">;
+def IL_OP_CLG                   : ILOpCode<17, "clg">;
+def IL_OP_CMOV                  : ILOpCode<18, "cmov">;
+def IL_OP_CMOV_LOGICAL          : ILOpCode<19, "cmov_logical">;
+def IL_OP_CMP                   : ILOpCode<20, "cmp">;
+def IL_OP_CONTINUE              : ILOpCode<21, "continue">;
+def IL_OP_CONTINUE_LOGICALNZ    : ILOpCode<22, "continue_logicalnz">;
+def IL_OP_CONTINUE_LOGICALZ     : ILOpCode<23, "continue_logicalz">;
+def IL_OP_CONTINUEC             : ILOpCode<24, "continuec">;
+def IL_OP_COS                   : ILOpCode<25, "cos">;
+def IL_OP_COS_VEC               : ILOpCode<26, "cos_vec">;
+def IL_OP_D_2_F                 : ILOpCode<27, "d2f">;
+def IL_OP_D_ADD                 : ILOpCode<28, "dadd">;
+def IL_OP_D_EQ                  : ILOpCode<29, "deq">;
+def IL_OP_D_FRC                 : ILOpCode<30, "dfrac">;
+def IL_OP_D_FREXP               : ILOpCode<31, "dfrexp">;
+def IL_OP_D_GE                  : ILOpCode<32, "dge">;
+def IL_OP_D_LDEXP               : ILOpCode<33, "dldexp">;
+def IL_OP_D_LT                  : ILOpCode<34, "dlt">;
+def IL_OP_D_MAD                 : ILOpCode<35, "dmad">;
+def IL_OP_D_MUL                 : ILOpCode<36, "dmul">;
+def IL_OP_D_NE                  : ILOpCode<37, "dne">;
+def IL_OP_DEFAULT               : ILOpCode<38, "default">;
+def IL_OP_DISCARD_LOGICALNZ     : ILOpCode<39, "discard_logicalnz">;
+def IL_OP_DISCARD_LOGICALZ      : ILOpCode<40, "discard_logicalz">;
+def IL_OP_DIV                   : ILOpCode<41, "div_zeroop(infinity)">;
+def IL_OP_DP2                   : ILOpCode<42, "dp2">;
+def IL_OP_DP3                   : ILOpCode<43, "dp3">;
+def IL_OP_DP4                   : ILOpCode<44, "dp4">;
+def IL_OP_ELSE                  : ILOpCode<45, "else">;
+def IL_OP_END                   : ILOpCode<46, "end">;
+def IL_OP_ENDFUNC               : ILOpCode<47, "endfunc">;
+def IL_OP_ENDIF                 : ILOpCode<48, "endif">;
+def IL_OP_ENDLOOP               : ILOpCode<49, "endloop">;
+def IL_OP_ENDMAIN               : ILOpCode<50, "endmain">;
+def IL_OP_ENDSWITCH             : ILOpCode<51, "endswitch">;
+def IL_OP_EQ                    : ILOpCode<52, "eq">;
+def IL_OP_EXP                   : ILOpCode<53, "exp">;
+def IL_OP_EXP_VEC               : ILOpCode<54, "exp_vec">;
+def IL_OP_F_2_D                 : ILOpCode<55, "f2d">;
+def IL_OP_FLR                   : ILOpCode<56, "flr">;
+def IL_OP_FRC                   : ILOpCode<57, "frc">;
+def IL_OP_FTOI                  : ILOpCode<58, "ftoi">;
+def IL_OP_FTOU                  : ILOpCode<59, "ftou">;
+def IL_OP_FUNC                  : ILOpCode<60, "func">;
+def IL_OP_GE                    : ILOpCode<61, "ge">;
+def IL_OP_I_ADD                 : ILOpCode<62, "iadd">;
+def IL_OP_I_EQ                  : ILOpCode<63, "ieq">;
+def IL_OP_I_GE                  : ILOpCode<64, "ige">;
+def IL_OP_I_LT                  : ILOpCode<65, "ilt">;
+def IL_OP_I_MAD                 : ILOpCode<66, "imad">;
+def IL_OP_I_MAX                 : ILOpCode<67, "imax">;
+def IL_OP_I_MIN                 : ILOpCode<68, "imin">;
+def IL_OP_I_MUL                 : ILOpCode<69, "imul">;
+def IL_OP_I_MUL_HIGH            : ILOpCode<70, "imul_high">;
+def IL_OP_I_NE                  : ILOpCode<71, "ine">;
+def IL_OP_I_NEGATE              : ILOpCode<72, "inegate">;
+def IL_OP_I_NOT                 : ILOpCode<73, "inot">;
+def IL_OP_I_OR                  : ILOpCode<74, "ior">;
+def IL_OP_I_SHL                 : ILOpCode<75, "ishl">;
+def IL_OP_I_SHR                 : ILOpCode<76, "ishr">;
+def IL_OP_I_XOR                 : ILOpCode<77, "ixor">;
+def IL_OP_IF_LOGICALNZ          : ILOpCode<78, "if_logicalnz">;
+def IL_OP_IF_LOGICALZ           : ILOpCode<79, "if_logicalz">;
+def IL_OP_IFC                   : ILOpCode<80, "ifc">;
+def IL_OP_ITOF                  : ILOpCode<81, "itof">;
+def IL_OP_LN                    : ILOpCode<82, "ln">;
+def IL_OP_LOG                   : ILOpCode<83, "log">;
+def IL_OP_LOG_VEC               : ILOpCode<84, "log_vec">;
+def IL_OP_LOOP                  : ILOpCode<85, "loop">;
+def IL_OP_LT                    : ILOpCode<86, "lt">;
+def IL_OP_MAD                   : ILOpCode<87, "mad_ieee">;
+def IL_OP_MAX                   : ILOpCode<88, "max_ieee">;
+def IL_OP_MIN                   : ILOpCode<89, "min_ieee">;
+def IL_OP_MOD                   : ILOpCode<90, "mod_ieee">;
+def IL_OP_MOV                   : ILOpCode<91, "mov">;
+def IL_OP_MUL_IEEE              : ILOpCode<92, "mul_ieee">;
+def IL_OP_NE                    : ILOpCode<93, "ne">;
+def IL_OP_NRM                   : ILOpCode<94, "nrm_nrm4_zeroop(zero)">;
+def IL_OP_POW                   : ILOpCode<95, "pow">;
+def IL_OP_RCP                   : ILOpCode<96, "rcp">;
+def IL_OP_RET                   : ILOpCode<97, "ret">;
+def IL_OP_RET_DYN               : ILOpCode<98, "ret_dyn">;
+def IL_OP_RET_LOGICALNZ         : ILOpCode<99, "ret_logicalnz">;
+def IL_OP_RET_LOGICALZ          : ILOpCode<100, "ret_logicalz">;
+def IL_OP_RND                   : ILOpCode<101, "rnd">;
+def IL_OP_ROUND_NEAR            : ILOpCode<102, "round_nearest">;
+def IL_OP_ROUND_NEG_INF         : ILOpCode<103, "round_neginf">;
+def IL_OP_ROUND_POS_INF         : ILOpCode<104, "round_plusinf">;
+def IL_OP_ROUND_ZERO            : ILOpCode<105, "round_z">;
+def IL_OP_RSQ                   : ILOpCode<106, "rsq">;
+def IL_OP_RSQ_VEC               : ILOpCode<107, "rsq_vec">;
+def IL_OP_SAMPLE                : ILOpCode<108, "sample">;
+def IL_OP_SAMPLE_L              : ILOpCode<109, "sample_l">;
+def IL_OP_SET                   : ILOpCode<110, "set">;
+def IL_OP_SGN                   : ILOpCode<111, "sgn">;
+def IL_OP_SIN                   : ILOpCode<112, "sin">;
+def IL_OP_SIN_VEC               : ILOpCode<113, "sin_vec">;
+def IL_OP_SUB                   : ILOpCode<114, "sub">;
+def IL_OP_SWITCH                : ILOpCode<115, "switch">;
+def IL_OP_TRC                   : ILOpCode<116, "trc">;
+def IL_OP_U_DIV                 : ILOpCode<117, "udiv">;
+def IL_OP_U_GE                  : ILOpCode<118, "uge">;
+def IL_OP_U_LT                  : ILOpCode<119, "ult">;
+def IL_OP_U_MAD                 : ILOpCode<120, "umad">;
+def IL_OP_U_MAX                 : ILOpCode<121, "umax">;
+def IL_OP_U_MIN                 : ILOpCode<122, "umin">;
+def IL_OP_U_MOD                 : ILOpCode<123, "umod">;
+def IL_OP_U_MUL                 : ILOpCode<124, "umul">;
+def IL_OP_U_MUL_HIGH            : ILOpCode<125, "umul_high">;
+def IL_OP_U_SHR                 : ILOpCode<126, "ushr">;
+def IL_OP_UTOF                  : ILOpCode<127, "utof">;
+def IL_OP_WHILE                 : ILOpCode<128, "whileloop">;
+// SC IL instructions that are not in CAL IL
+def IL_OP_ACOS                  : ILOpCode<129, "acos">;
+def IL_OP_ASIN                  : ILOpCode<130, "asin">;
+def IL_OP_EXN                   : ILOpCode<131, "exn">;
+def IL_OP_UBIT_REVERSE          : ILOpCode<132, "ubit_reverse">;
+def IL_OP_UBIT_EXTRACT          : ILOpCode<133, "ubit_extract">;
+def IL_OP_IBIT_EXTRACT          : ILOpCode<134, "ibit_extract">;
+def IL_OP_SQRT                  : ILOpCode<135, "sqrt">;
+def IL_OP_SQRT_VEC              : ILOpCode<136, "sqrt_vec">;
+def IL_OP_ATAN                  : ILOpCode<137, "atan">;
+def IL_OP_TAN                   : ILOpCode<137, "tan">;
+def IL_OP_D_DIV                 : ILOpCode<138, "ddiv">;
+def IL_OP_F_NEG                 : ILOpCode<139, "mov">;
+def IL_OP_GT                    : ILOpCode<140, "gt">;
+def IL_OP_LE                    : ILOpCode<141, "lt">;
+def IL_OP_DIST                  : ILOpCode<142, "dist">;
+def IL_OP_LEN                   : ILOpCode<143, "len">;
+def IL_OP_MACRO                 : ILOpCode<144, "mcall">;
+def IL_OP_INTR                  : ILOpCode<145, "call">;
+def IL_OP_I_FFB_HI              : ILOpCode<146, "ffb_hi">;
+def IL_OP_I_FFB_LO              : ILOpCode<147, "ffb_lo">;
+def IL_OP_BARRIER               : ILOpCode<148, "fence_threads_memory_lds">;
+def IL_OP_BARRIER_LOCAL         : ILOpCode<149, "fence_threads_lds">;
+def IL_OP_BARRIER_GLOBAL        : ILOpCode<150, "fence_threads_memory">;
+def IL_OP_FENCE                 : ILOpCode<151, "fence_lds_memory">;
+def IL_OP_FENCE_READ_ONLY       : ILOpCode<152, "fence_lds_mem_read_only">;
+def IL_OP_FENCE_WRITE_ONLY      : ILOpCode<153, "fence_lds_mem_write_only">;
+def IL_PSEUDO_INST              : ILOpCode<154, ";Pseudo Op">;
+def IL_OP_UNPACK_0              : ILOpCode<155, "unpack0">;
+def IL_OP_UNPACK_1              : ILOpCode<156, "unpack1">;
+def IL_OP_UNPACK_2              : ILOpCode<157, "unpack2">;
+def IL_OP_UNPACK_3              : ILOpCode<158, "unpack3">;
+def IL_OP_PI_REDUCE             : ILOpCode<159, "pireduce">;
+def IL_OP_IBIT_COUNT            : ILOpCode<160, "icbits">;
+def IL_OP_I_FFB_SGN             : ILOpCode<161, "ffb_shi">;
+def IL_OP_F2U4                  : ILOpCode<162, "f_2_u4">;
+def IL_OP_BIT_ALIGN             : ILOpCode<163, "bitalign">;
+def IL_OP_BYTE_ALIGN            : ILOpCode<164, "bytealign">;
+def IL_OP_U4_LERP               : ILOpCode<165, "u4lerp">;
+def IL_OP_SAD                   : ILOpCode<166, "sad">;
+def IL_OP_SAD_HI                : ILOpCode<167, "sadhi">;
+def IL_OP_SAD4                  : ILOpCode<168, "sad4">;
+def IL_OP_UBIT_INSERT           : ILOpCode<169, "ubit_insert">;
+def IL_OP_I_CARRY               : ILOpCode<170, "icarry">;
+def IL_OP_I_BORROW              : ILOpCode<171, "iborrow">;
+def IL_OP_U_MAD24               : ILOpCode<172, "umad24">;
+def IL_OP_U_MUL24               : ILOpCode<173, "umul24">;
+def IL_OP_I_MAD24               : ILOpCode<174, "imad24">;
+def IL_OP_I_MUL24               : ILOpCode<175, "imul24">;
+def IL_OP_CLAMP                 : ILOpCode<176, "clamp">;
+def IL_OP_LERP                  : ILOpCode<177, "lrp">;
+def IL_OP_FMA                   : ILOpCode<178, "fma">;
+def IL_OP_D_MIN                 : ILOpCode<179, "dmin">;
+def IL_OP_D_MAX                 : ILOpCode<180, "dmax">;
+def IL_OP_D_SQRT                : ILOpCode<181, "dsqrt">;
+def IL_OP_DP2_ADD               : ILOpCode<182, "dp2add">;
+def IL_OP_F16_TO_F32            : ILOpCode<183, "f162f">;
+def IL_OP_F32_TO_F16            : ILOpCode<184, "f2f16">;
+def IL_REG_LOCAL_ID_FLAT        : ILOpCode<185, "vTidInGrpFlat">;
+def IL_REG_LOCAL_ID             : ILOpCode<186, "vTidInGrp">;
+def IL_REG_GLOBAL_ID_FLAT       : ILOpCode<187, "vAbsTidFlat">;
+def IL_REG_GLOBAL_ID            : ILOpCode<188, "vAbsTid">;
+def IL_REG_GROUP_ID_FLAT        : ILOpCode<189, "vThreadGrpIDFlat">;
+def IL_REG_GROUP_ID             : ILOpCode<190, "vThreadGrpID">;
+def IL_OP_D_RCP                 : ILOpCode<191, "drcp_zeroop(infinity)">;
+def IL_OP_D_RSQ                 : ILOpCode<192, "drsq_zeroop(infinity)">;
+def IL_OP_D_MOV                 : ILOpCode<193, "dmov">;
+def IL_OP_D_MOVC                : ILOpCode<194, "dmovc">;
+def IL_OP_NOP                   : ILOpCode<195, "nop">;
+def IL_OP_UAV_ADD               : ILOpCode<196, "uav_add">;
+def IL_OP_UAV_AND               : ILOpCode<197, "uav_and">;
+def IL_OP_UAV_MAX               : ILOpCode<198, "uav_max">;
+def IL_OP_UAV_MIN               : ILOpCode<199, "uav_min">;
+def IL_OP_UAV_OR                : ILOpCode<200, "uav_or">;
+def IL_OP_UAV_RSUB              : ILOpCode<201, "uav_rsub">;
+def IL_OP_UAV_SUB               : ILOpCode<202, "uav_sub">;
+def IL_OP_UAV_UMAX              : ILOpCode<203, "uav_umax">;
+def IL_OP_UAV_UMIN              : ILOpCode<204, "uav_umin">;
+def IL_OP_UAV_XOR               : ILOpCode<205, "uav_xor">;
+def IL_OP_UAV_INC               : ILOpCode<206, "uav_uinc">;
+def IL_OP_UAV_DEC               : ILOpCode<207, "uav_udec">;
+def IL_OP_UAV_CMP               : ILOpCode<208, "uav_cmp">;
+def IL_OP_UAV_READ_ADD          : ILOpCode<209, "uav_read_add">;
+def IL_OP_UAV_READ_AND          : ILOpCode<210, "uav_read_and">;
+def IL_OP_UAV_READ_MAX          : ILOpCode<211, "uav_read_max">;
+def IL_OP_UAV_READ_MIN          : ILOpCode<212, "uav_read_min">;
+def IL_OP_UAV_READ_OR           : ILOpCode<213, "uav_read_or">;
+def IL_OP_UAV_READ_RSUB         : ILOpCode<214, "uav_read_rsub">;
+def IL_OP_UAV_READ_SUB          : ILOpCode<215, "uav_read_sub">;
+def IL_OP_UAV_READ_UMAX         : ILOpCode<216, "uav_read_umax">;
+def IL_OP_UAV_READ_UMIN         : ILOpCode<217, "uav_read_umin">;
+def IL_OP_UAV_READ_XOR          : ILOpCode<218, "uav_read_xor">;
+def IL_OP_UAV_READ_INC          : ILOpCode<219, "uav_read_uinc">;
+def IL_OP_UAV_READ_DEC          : ILOpCode<220, "uav_read_udec">;
+def IL_OP_UAV_READ_XCHG         : ILOpCode<221, "uav_read_xchg">;
+def IL_OP_UAV_READ_CMPXCHG      : ILOpCode<222, "uav_read_cmp_xchg">;
+def IL_OP_LDS_ADD               : ILOpCode<223, "lds_add">;
+def IL_OP_LDS_AND               : ILOpCode<224, "lds_and">;
+def IL_OP_LDS_MAX               : ILOpCode<225, "lds_max">;
+def IL_OP_LDS_MIN               : ILOpCode<226, "lds_min">;
+def IL_OP_LDS_OR                : ILOpCode<227, "lds_or">;
+def IL_OP_LDS_RSUB              : ILOpCode<228, "lds_rsub">;
+def IL_OP_LDS_SUB               : ILOpCode<229, "lds_sub">;
+def IL_OP_LDS_UMAX              : ILOpCode<230, "lds_umax">;
+def IL_OP_LDS_UMIN              : ILOpCode<231, "lds_umin">;
+def IL_OP_LDS_XOR               : ILOpCode<232, "lds_xor">;
+def IL_OP_LDS_INC               : ILOpCode<233, "lds_inc">;
+def IL_OP_LDS_DEC               : ILOpCode<234, "lds_dec">;
+def IL_OP_LDS_CMP               : ILOpCode<235, "lds_cmp">;
+def IL_OP_LDS_READ_ADD          : ILOpCode<236, "lds_read_add">;
+def IL_OP_LDS_READ_AND          : ILOpCode<237, "lds_read_and">;
+def IL_OP_LDS_READ_MAX          : ILOpCode<238, "lds_read_max">;
+def IL_OP_LDS_READ_MIN          : ILOpCode<239, "lds_read_min">;
+def IL_OP_LDS_READ_OR           : ILOpCode<240, "lds_read_or">;
+def IL_OP_LDS_READ_RSUB         : ILOpCode<241, "lds_read_rsub">;
+def IL_OP_LDS_READ_SUB          : ILOpCode<242, "lds_read_sub">;
+def IL_OP_LDS_READ_UMAX         : ILOpCode<243, "lds_read_umax">;
+def IL_OP_LDS_READ_UMIN         : ILOpCode<244, "lds_read_umin">;
+def IL_OP_LDS_READ_XOR          : ILOpCode<245, "lds_read_xor">;
+def IL_OP_LDS_READ_INC          : ILOpCode<246, "lds_read_inc">;
+def IL_OP_LDS_READ_DEC          : ILOpCode<247, "lds_read_dec">;
+def IL_OP_LDS_READ_XCHG         : ILOpCode<248, "lds_read_xchg">;
+def IL_OP_LDS_READ_CMPXCHG      : ILOpCode<249, "lds_read_cmp_xchg">;
+def IL_OP_GDS_ADD               : ILOpCode<250, "gds_add">;
+def IL_OP_GDS_AND               : ILOpCode<251, "gds_and">;
+def IL_OP_GDS_MAX               : ILOpCode<252, "gds_max">;
+def IL_OP_GDS_MIN               : ILOpCode<253, "gds_min">;
+def IL_OP_GDS_OR                : ILOpCode<254, "gds_or">;
+def IL_OP_GDS_RSUB              : ILOpCode<255, "gds_rsub">;
+def IL_OP_GDS_SUB               : ILOpCode<256, "gds_sub">;
+def IL_OP_GDS_UMAX              : ILOpCode<257, "gds_umax">;
+def IL_OP_GDS_UMIN              : ILOpCode<258, "gds_umin">;
+def IL_OP_GDS_MSKOR             : ILOpCode<259, "gds_mskor">;
+def IL_OP_GDS_XOR               : ILOpCode<260, "gds_xor">;
+def IL_OP_GDS_INC               : ILOpCode<261, "gds_inc">;
+def IL_OP_GDS_DEC               : ILOpCode<262, "gds_dec">;
+def IL_OP_GDS_CMP               : ILOpCode<263, "gds_cmp">;
+def IL_OP_GDS_READ_ADD          : ILOpCode<264, "gds_read_add">;
+def IL_OP_GDS_READ_AND          : ILOpCode<265, "gds_read_and">;
+def IL_OP_GDS_READ_MAX          : ILOpCode<266, "gds_read_max">;
+def IL_OP_GDS_READ_MIN          : ILOpCode<267, "gds_read_min">;
+def IL_OP_GDS_READ_OR           : ILOpCode<268, "gds_read_or">;
+def IL_OP_GDS_READ_RSUB         : ILOpCode<269, "gds_read_rsub">;
+def IL_OP_GDS_READ_SUB          : ILOpCode<270, "gds_read_sub">;
+def IL_OP_GDS_READ_UMAX         : ILOpCode<271, "gds_read_umax">;
+def IL_OP_GDS_READ_UMIN         : ILOpCode<272, "gds_read_umin">;
+def IL_OP_GDS_READ_MSKOR        : ILOpCode<273, "gds_read_mskor">;
+def IL_OP_GDS_READ_XOR          : ILOpCode<274, "gds_read_xor">;
+def IL_OP_GDS_READ_INC          : ILOpCode<275, "gds_read_inc">;
+def IL_OP_GDS_READ_DEC          : ILOpCode<276, "gds_read_dec">;
+def IL_OP_GDS_READ_XCHG         : ILOpCode<277, "gds_read_xchg">;
+def IL_OP_GDS_READ_CMPXCHG      : ILOpCode<278, "gds_read_cmp_xchg">;
+def IL_OP_APPEND_BUF_ALLOC      : ILOpCode<279, "append_buf_alloc">;
+def IL_OP_APPEND_BUF_CONSUME    : ILOpCode<280, "append_buf_consume">;
+def IL_OP_I64_ADD               : ILOpCode<281, "i64add">;
+def IL_OP_I64_MAX               : ILOpCode<282, "i64max">;
+def IL_OP_U64_MAX               : ILOpCode<283, "u64max">;
+def IL_OP_I64_MIN               : ILOpCode<284, "i64min">;
+def IL_OP_U64_MIN               : ILOpCode<285, "u64min">;
+def IL_OP_I64_NEGATE            : ILOpCode<286, "i64negate">;
+def IL_OP_I64_SHL               : ILOpCode<287, "i64shl">;
+def IL_OP_I64_SHR               : ILOpCode<288, "i64shr">;
+def IL_OP_U64_SHR               : ILOpCode<289, "u64shr">;
+def IL_OP_I64_EQ                : ILOpCode<290, "i64eq">;
+def IL_OP_I64_GE                : ILOpCode<291, "i64ge">;
+def IL_OP_U64_GE                : ILOpCode<292, "u64ge">;
+def IL_OP_I64_LT                : ILOpCode<293, "i64lt">;
+def IL_OP_U64_LT                : ILOpCode<294, "u64lt">;
+def IL_OP_I64_NE                : ILOpCode<295, "i64ne">;
+def IL_OP_U_MULHI24             : ILOpCode<296, "umul24_high">;
+def IL_OP_I_MULHI24             : ILOpCode<297, "imul24_high">;
+def IL_OP_GDS_LOAD              : ILOpCode<298, "gds_load">;
+def IL_OP_GDS_STORE             : ILOpCode<299, "gds_store">;
+def IL_OP_LDS_LOAD              : ILOpCode<300, "lds_load">;
+def IL_OP_LDS_LOAD_VEC          : ILOpCode<301, "lds_load_vec">;
+def IL_OP_LDS_LOAD_BYTE         : ILOpCode<302, "lds_load_byte">;
+def IL_OP_LDS_LOAD_UBYTE        : ILOpCode<303, "lds_load_ubyte">;
+def IL_OP_LDS_LOAD_SHORT        : ILOpCode<304, "lds_load_short">;
+def IL_OP_LDS_LOAD_USHORT       : ILOpCode<305, "lds_load_ushort">;
+def IL_OP_LDS_STORE             : ILOpCode<306, "lds_store">;
+def IL_OP_LDS_STORE_VEC         : ILOpCode<307, "lds_store_vec">;
+def IL_OP_LDS_STORE_BYTE        : ILOpCode<308, "lds_store_byte">;
+def IL_OP_LDS_STORE_SHORT       : ILOpCode<309, "lds_store_short">;
+def IL_OP_RAW_UAV_LOAD          : ILOpCode<310, "uav_raw_load">;
+def IL_OP_RAW_UAV_STORE         : ILOpCode<311, "uav_raw_store">;
+def IL_OP_ARENA_UAV_LOAD        : ILOpCode<312, "uav_arena_load">;
+def IL_OP_ARENA_UAV_STORE       : ILOpCode<313, "uav_arena_store">;
+def IL_OP_LDS_MSKOR             : ILOpCode<314, "lds_mskor">;
+def IL_OP_LDS_READ_MSKOR        : ILOpCode<315, "lds_read_mskor">;
+def IL_OP_UAV_BYTE_LOAD         : ILOpCode<316, "uav_byte_load">;
+def IL_OP_UAV_UBYTE_LOAD        : ILOpCode<317, "uav_ubyte_load">;
+def IL_OP_UAV_SHORT_LOAD        : ILOpCode<318, "uav_short_load">;
+def IL_OP_UAV_USHORT_LOAD       : ILOpCode<319, "uav_ushort_load">;
+def IL_OP_UAV_BYTE_STORE        : ILOpCode<320, "uav_byte_store">;
+def IL_OP_UAV_SHORT_STORE       : ILOpCode<320, "uav_short_store">;
+def IL_OP_UAV_STORE             : ILOpCode<321, "uav_store">;
+def IL_OP_UAV_LOAD              : ILOpCode<322, "uav_load">;
+def IL_OP_MUL                   : ILOpCode<323, "mul">;
+def IL_OP_DIV_INF               : ILOpCode<324, "div_zeroop(infinity)">;
+def IL_OP_DIV_FLTMAX            : ILOpCode<325, "div_zeroop(fltmax)">;
+def IL_OP_DIV_ZERO              : ILOpCode<326, "div_zeroop(zero)">;
+def IL_OP_DIV_INFELSEMAX        : ILOpCode<327, "div_zeroop(inf_else_max)">;
+def IL_OP_FTOI_FLR              : ILOpCode<328, "ftoi_flr">;
+def IL_OP_FTOI_RPI              : ILOpCode<329, "ftoi_rpi">;
+def IL_OP_F32_TO_F16_NEAR       : ILOpCode<330, "f2f16_near">;
+def IL_OP_F32_TO_F16_NEG_INF    : ILOpCode<331, "f2f16_neg_inf">;
+def IL_OP_F32_TO_F16_PLUS_INF   : ILOpCode<332, "f2f16_plus_inf">;
+def IL_OP_I64_MUL               : ILOpCode<333, "i64mul">;
+def IL_OP_U64_MUL               : ILOpCode<334, "u64mul">;
+def IL_OP_CU_ID                 : ILOpCode<355, "cu_id">;
+def IL_OP_WAVE_ID               : ILOpCode<356, "wave_id">;
+def IL_OP_I64_SUB               : ILOpCode<357, "i64sub">;
+def IL_OP_I64_DIV               : ILOpCode<358, "i64div">;
+def IL_OP_U64_DIV               : ILOpCode<359, "u64div">;
+def IL_OP_I64_MOD               : ILOpCode<360, "i64mod">;
+def IL_OP_U64_MOD               : ILOpCode<361, "u64mod">;
+def IL_DCL_GWS_THREAD_COUNT     : ILOpCode<362, "dcl_gws_thread_count">;
+def IL_DCL_SEMAPHORE            : ILOpCode<363, "dcl_semaphore">;
+def IL_OP_SEMAPHORE_INIT        : ILOpCode<364, "init_semaphore">;
+def IL_OP_SEMAPHORE_WAIT        : ILOpCode<365, "semaphore_wait">;
+def IL_OP_SEMAPHORE_SIGNAL      : ILOpCode<366, "semaphore_signal">;
+def IL_OP_BARRIER_REGION        : ILOpCode<377, "fence_threads_gds">;
+def IL_OP_BFI                   : ILOpCode<394, "bfi">;
+def IL_OP_BFM                   : ILOpCode<395, "bfm">;
+def IL_DBG_STRING               : ILOpCode<396, "dbg_string">;
+def IL_DBG_LINE                 : ILOpCode<397, "dbg_line">;
+def IL_DBG_TEMPLOC              : ILOpCode<398, "dbg_temploc">;
diff --git a/lib/Target/AMDGPU/AMDILEvergreenDevice.cpp b/lib/Target/AMDGPU/AMDILEvergreenDevice.cpp
new file mode 100644 (file)
index 0000000..6652c74
--- /dev/null
@@ -0,0 +1,183 @@
+//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILEvergreenDevice.h"
+
+using namespace llvm;
+
+AMDILEvergreenDevice::AMDILEvergreenDevice(AMDILSubtarget *ST)
+: AMDILDevice(ST) {
+  setCaps();
+  std::string name = ST->getDeviceName();
+  if (name == "cedar") {
+    mDeviceFlag = OCL_DEVICE_CEDAR;
+  } else if (name == "redwood") {
+    mDeviceFlag = OCL_DEVICE_REDWOOD;
+  } else if (name == "cypress") {
+    mDeviceFlag = OCL_DEVICE_CYPRESS;
+  } else {
+    mDeviceFlag = OCL_DEVICE_JUNIPER;
+  }
+}
+
+AMDILEvergreenDevice::~AMDILEvergreenDevice() {
+}
+
+size_t AMDILEvergreenDevice::getMaxLDSSize() const {
+  if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_800;
+  } else {
+    return 0;
+  }
+}
+size_t AMDILEvergreenDevice::getMaxGDSSize() const {
+  if (usesHardware(AMDILDeviceInfo::RegionMem)) {
+    return MAX_LDS_SIZE_800;
+  } else {
+    return 0;
+  }
+}
+uint32_t AMDILEvergreenDevice::getMaxNumUAVs() const {
+  return 12;
+}
+
+uint32_t AMDILEvergreenDevice::getResourceID(uint32_t id) const {
+  switch(id) {
+  default:
+    assert(0 && "ID type passed in is unknown!");
+    break;
+  case CONSTANT_ID:
+  case RAW_UAV_ID:
+    if (mSTM->calVersion() >= CAL_VERSION_GLOBAL_RETURN_BUFFER) {
+      return GLOBAL_RETURN_RAW_UAV_ID;
+    } else {
+      return DEFAULT_RAW_UAV_ID;
+    }
+  case GLOBAL_ID:
+  case ARENA_UAV_ID:
+    return DEFAULT_ARENA_UAV_ID;
+  case LDS_ID:
+    if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+      return DEFAULT_LDS_ID;
+    } else {
+      return DEFAULT_ARENA_UAV_ID;
+    }
+  case GDS_ID:
+    if (usesHardware(AMDILDeviceInfo::RegionMem)) {
+      return DEFAULT_GDS_ID;
+    } else {
+      return DEFAULT_ARENA_UAV_ID;
+    }
+  case SCRATCH_ID:
+    if (usesHardware(AMDILDeviceInfo::PrivateMem)) {
+      return DEFAULT_SCRATCH_ID;
+    } else {
+      return DEFAULT_ARENA_UAV_ID;
+    }
+  };
+  return 0;
+}
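+
+// Illustrative only (not part of the backend): a caller queries the mapping
+// above through the device, along the lines of
+//
+//   uint32_t LDSRes = Device->getResourceID(AMDILDevice::LDS_ID);
+//
+// assuming 'Device' is an AMDILEvergreenDevice* and the resource IDs are
+// enumerated on AMDILDevice. The call yields DEFAULT_LDS_ID when hardware
+// local memory is enabled and falls back to DEFAULT_ARENA_UAV_ID otherwise.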
+
+size_t AMDILEvergreenDevice::getWavefrontSize() const {
+  return AMDILDevice::WavefrontSize;
+}
+
+uint32_t AMDILEvergreenDevice::getGeneration() const {
+  return AMDILDeviceInfo::HD5XXX;
+}
+
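+// Capability-bit protocol used below: a feature set in mHWBits is handled
+// natively by the hardware, while the same feature in mSWBits selects a
+// software emulation path. When a sufficiently new CAL version provides a
+// hardware path (e.g. HW64BitDivMod), the corresponding software bit is
+// reset so that only one path stays active.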
+void AMDILEvergreenDevice::setCaps() {
+  mSWBits.set(AMDILDeviceInfo::ArenaSegment);
+  mHWBits.set(AMDILDeviceInfo::ArenaUAV);
+  if (mSTM->calVersion() >= CAL_VERSION_SC_140) {
+    mHWBits.set(AMDILDeviceInfo::HW64BitDivMod);
+    mSWBits.reset(AMDILDeviceInfo::HW64BitDivMod);
+  }
+  mSWBits.set(AMDILDeviceInfo::Signed24BitOps);
+  if (mSTM->isOverride(AMDILDeviceInfo::ByteStores)) {
+    mHWBits.set(AMDILDeviceInfo::ByteStores);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::Debug)) {
+    mSWBits.set(AMDILDeviceInfo::LocalMem);
+    mSWBits.set(AMDILDeviceInfo::RegionMem);
+  } else {
+    mHWBits.set(AMDILDeviceInfo::LocalMem);
+    mHWBits.set(AMDILDeviceInfo::RegionMem);
+  }
+  mHWBits.set(AMDILDeviceInfo::Images);
+  if (mSTM->isOverride(AMDILDeviceInfo::NoAlias)) {
+    mHWBits.set(AMDILDeviceInfo::NoAlias);
+  }
+  if (mSTM->calVersion() > CAL_VERSION_GLOBAL_RETURN_BUFFER) {
+    mHWBits.set(AMDILDeviceInfo::CachedMem);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::MultiUAV)) {
+    mHWBits.set(AMDILDeviceInfo::MultiUAV);
+  }
+  if (mSTM->calVersion() > CAL_VERSION_SC_136) {
+    mHWBits.set(AMDILDeviceInfo::ByteLDSOps);
+    mSWBits.reset(AMDILDeviceInfo::ByteLDSOps);
+    mHWBits.set(AMDILDeviceInfo::ArenaVectors);
+  } else {
+    mSWBits.set(AMDILDeviceInfo::ArenaVectors);
+  }
+  if (mSTM->calVersion() > CAL_VERSION_SC_137) {
+    mHWBits.set(AMDILDeviceInfo::LongOps);
+    mSWBits.reset(AMDILDeviceInfo::LongOps);
+  }
+  mHWBits.set(AMDILDeviceInfo::TmrReg);
+}
+
+AMDILCypressDevice::AMDILCypressDevice(AMDILSubtarget *ST)
+  : AMDILEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDILCypressDevice::~AMDILCypressDevice() {
+}
+
+void AMDILCypressDevice::setCaps() {
+  if (mSTM->isOverride(AMDILDeviceInfo::DoubleOps)) {
+    mHWBits.set(AMDILDeviceInfo::DoubleOps);
+    mHWBits.set(AMDILDeviceInfo::FMA);
+  }
+}
+
+
+AMDILCedarDevice::AMDILCedarDevice(AMDILSubtarget *ST)
+  : AMDILEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDILCedarDevice::~AMDILCedarDevice() {
+}
+
+void AMDILCedarDevice::setCaps() {
+  mSWBits.set(AMDILDeviceInfo::FMA);
+}
+
+size_t AMDILCedarDevice::getWavefrontSize() const {
+  return AMDILDevice::QuarterWavefrontSize;
+}
+
+AMDILRedwoodDevice::AMDILRedwoodDevice(AMDILSubtarget *ST)
+  : AMDILEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDILRedwoodDevice::~AMDILRedwoodDevice() {
+}
+
+void AMDILRedwoodDevice::setCaps() {
+  mSWBits.set(AMDILDeviceInfo::FMA);
+}
+
+size_t AMDILRedwoodDevice::getWavefrontSize() const {
+  return AMDILDevice::HalfWavefrontSize;
+}
diff --git a/lib/Target/AMDGPU/AMDILEvergreenDevice.h b/lib/Target/AMDGPU/AMDILEvergreenDevice.h
new file mode 100644 (file)
index 0000000..2639ab8
--- /dev/null
@@ -0,0 +1,87 @@
+//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===----------------------------------------------------------------------===//
+// This file will define the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
+// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef _AMDILEVERGREENDEVICE_H_
+#define _AMDILEVERGREENDEVICE_H_
+#include "AMDILDevice.h"
+#include "AMDILSubtarget.h"
+
+namespace llvm {
+  class AMDILSubtarget;
+//===----------------------------------------------------------------------===//
+// Evergreen generation of devices and their respective sub classes
+//===----------------------------------------------------------------------===//
+
+
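+// Rough hierarchy of the classes declared below (Juniper is the Evergreen
+// baseline):
+//
+//   AMDILEvergreenDevice        Juniper / HD57XX, full wavefront
+//     AMDILCypressDevice        HD58XX / HD59XX, adds doubles and FMA
+//     AMDILRedwoodDevice        HD55XX / HD56XX, half wavefront
+//     AMDILCedarDevice          HD54XX / HD53XX, quarter wavefront
+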
+// The AMDILEvergreenDevice is the base device class for all of the Evergreen
+// series of cards. This class contains information required to differentiate
+// the Evergreen device from the generic AMDILDevice. This device represents
+// the capabilities of the 'Juniper' cards, also known as the HD57XX.
+class AMDILEvergreenDevice : public AMDILDevice {
+public:
+  AMDILEvergreenDevice(AMDILSubtarget *ST);
+  virtual ~AMDILEvergreenDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual size_t getMaxGDSSize() const;
+  virtual size_t getWavefrontSize() const;
+  virtual uint32_t getGeneration() const;
+  virtual uint32_t getMaxNumUAVs() const;
+  virtual uint32_t getResourceID(uint32_t) const;
+protected:
+  virtual void setCaps();
+}; // AMDILEvergreenDevice
+
+// The AMDILCypressDevice is similar to the AMDILEvergreenDevice, except it has
+// support for double precision operations. This device is used to represent
+// both the Cypress and Hemlock cards, which are commercially known as HD58XX
+// and HD59XX cards.
+class AMDILCypressDevice : public AMDILEvergreenDevice {
+public:
+  AMDILCypressDevice(AMDILSubtarget *ST);
+  virtual ~AMDILCypressDevice();
+private:
+  virtual void setCaps();
+}; // AMDILCypressDevice
+
+
+// The AMDILCedarDevice is the class that represents all of the 'Cedar' based
+// devices. This class differs from the base AMDILEvergreenDevice in that the
+// device is roughly a quarter of a 'Juniper'. These are commercially known as the
+// HD54XX and HD53XX series of cards.
+class AMDILCedarDevice : public AMDILEvergreenDevice {
+public:
+  AMDILCedarDevice(AMDILSubtarget *ST);
+  virtual ~AMDILCedarDevice();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+}; // AMDILCedarDevice
+
+// The AMDILRedwoodDevice is the class that represents all of the 'Redwood' based
+// devices. This class differs from the base class, in that these devices are
+// considered about half of a 'Juniper' device. These are commercially known as
+// the HD55XX and HD56XX series of cards.
+class AMDILRedwoodDevice : public AMDILEvergreenDevice {
+public:
+  AMDILRedwoodDevice(AMDILSubtarget *ST);
+  virtual ~AMDILRedwoodDevice();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+}; // AMDILRedwoodDevice
+  
+} // namespace llvm
+#endif // _AMDILEVERGREENDEVICE_H_
diff --git a/lib/Target/AMDGPU/AMDILFormats.td b/lib/Target/AMDGPU/AMDILFormats.td
new file mode 100644 (file)
index 0000000..5a71ded
--- /dev/null
@@ -0,0 +1,175 @@
+//==- AMDILFormats.td - AMDIL Instruction Formats ----*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//===--------------------------------------------------------------------===//
+include "AMDILTokenDesc.td"
+
+//===--------------------------------------------------------------------===//
+// The parent IL instruction class that inherits the Instruction class. This
+// class sets the corresponding namespace, the output and input dag lists, the
+// pattern to match, and the string to print out for the assembly printer.
+//===--------------------------------------------------------------------===//
+class ILFormat<ILOpCode op, dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+     let Namespace = "AMDGPU";
+     dag OutOperandList = outs;
+     dag InOperandList = ins;
+     ILOpCode operation = op;
+     let Pattern = pattern;
+     let AsmString = !strconcat(asmstr, "\n");
+     let isPseudo = 1;
+     let Itinerary = NullALU;
+     bit hasIEEEFlag = 0;
+     bit hasZeroOpFlag = 0;
+}
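+
+// A hedged sketch of how this format is instantiated (EXAMPLE_REG is a
+// hypothetical register class; the real instruction definitions live in the
+// instruction .td files):
+//
+//   def EXAMPLE_MOV : ILFormat<IL_OP_MOV, (outs EXAMPLE_REG:$dst),
+//                              (ins EXAMPLE_REG:$src),
+//                              "mov $dst, $src", []>;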
+
+//===--------------------------------------------------------------------===//
+// Class that has one input parameter and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0" and it
+// handles the unary math operators.
+// It sets the binary tokens ILSrc, ILSrcMod, and ILRelAddr, plus the
+// register-relative ILSrc and ILSrcMod, for input and output register 0.
+//===--------------------------------------------------------------------===//
+class OneInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : ILFormat<op, outs, ins, asmstr, pattern>
+{
+     ILDst       dst_reg;
+     ILDstMod    dst_mod;
+     ILRelAddr   dst_rel;
+     ILSrc       dst_reg_rel;
+     ILSrcMod    dst_reg_rel_mod;
+     ILSrc       src0_reg;
+     ILSrcMod    src0_mod;
+     ILRelAddr   src0_rel;
+     ILSrc       src0_reg_rel;
+     ILSrcMod    src0_reg_rel_mod;
+}
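+
+// For illustration (GPRF32 is an assumed 32-bit float register class, not
+// one defined in this file): a unary math instruction built on this class
+// could read
+//
+//   def EXAMPLE_RCP : OneInOneOut<IL_OP_RCP, (outs GPRF32:$dst),
+//                                 (ins GPRF32:$src0),
+//                                 "rcp $dst, $src0", []>;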
+
+//===--------------------------------------------------------------------===//
+// This class is similar to the UnaryOp class; however, there is no
+// result value to assign.
+//===--------------------------------------------------------------------===//
+class UnaryOpNoRet<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : ILFormat<op, outs, ins, asmstr, pattern>
+{
+     ILSrc       src0_reg;
+     ILSrcMod    src0_mod;
+     ILRelAddr   src0_rel;
+     ILSrc       src0_reg_rel;
+     ILSrcMod    src0_reg_rel_mod;
+}
+
+//===--------------------------------------------------------------------===//
+// Set of classes that have two input parameters and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0, Src1" and it
+// handles the binary math operators and comparison operations.
+// It sets the binary tokens ILSrc, ILSrcMod, and ILRelAddr, plus the
+// register-relative ILSrc and ILSrcMod, for input register 1.
+//===--------------------------------------------------------------------===//
+class TwoInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : OneInOneOut<op, outs, ins, asmstr, pattern>
+{
+     ILSrc       src1_reg;
+     ILSrcMod    src1_mod;
+     ILRelAddr   src1_rel;
+     ILSrc       src1_reg_rel;
+     ILSrcMod    src1_reg_rel_mod;
+}
+
+//===--------------------------------------------------------------------===//
+// Similar to the UnaryOpNoRet class, but takes two input operands.
+// Used mainly for barrier instructions on the PC platform.
+//===--------------------------------------------------------------------===//
+class BinaryOpNoRet<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : UnaryOpNoRet<op, outs, ins, asmstr, pattern>
+{
+     ILSrc       src1_reg;
+     ILSrcMod    src1_mod;
+     ILRelAddr   src1_rel;
+     ILSrc       src1_reg_rel;
+     ILSrcMod    src1_reg_rel_mod;
+}
+
+//===--------------------------------------------------------------------===//
+// Set of classes that have three input parameters and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0, Src1, Src2" and it
+// handles the mad and conditional mov instructions.
+// It sets the binary tokens ILSrc, ILSrcMod, and ILRelAddr, plus the
+// register-relative ILSrc and ILSrcMod, if the addressing is register
+// relative. This class is the parent class of TernaryOp.
+//===--------------------------------------------------------------------===//
+class ThreeInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : TwoInOneOut<op, outs, ins, asmstr, pattern> {
+           ILSrc       src2_reg;
+           ILSrcMod    src2_mod;
+           ILRelAddr   src2_rel;
+           ILSrc       src2_reg_rel;
+           ILSrcMod    src2_reg_rel_mod;
+      }
+
+//===--------------------------------------------------------------------===//
+// Intrinsic classes
+// Generic versions of the above classes but for Target specific intrinsics
+// instead of SDNode patterns.
+//===--------------------------------------------------------------------===//
+let TargetPrefix = "AMDIL", isTarget = 1 in {
+     class VoidIntLong :
+          Intrinsic<[llvm_i64_ty], [], []>;
+     class VoidIntInt :
+          Intrinsic<[llvm_i32_ty], [], []>;
+     class VoidIntBool :
+          Intrinsic<[llvm_i32_ty], [], []>;
+     class UnaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+     class UnaryIntFloat :
+          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+     class ConvertIntFTOI :
+          Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
+     class ConvertIntITOF :
+          Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>;
+     class UnaryIntNoRetInt :
+          Intrinsic<[], [llvm_anyint_ty], []>;
+     class UnaryIntNoRetFloat :
+          Intrinsic<[], [llvm_anyfloat_ty], []>;
+     class BinaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+     class BinaryIntFloat :
+          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+     class BinaryIntNoRetInt :
+          Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>;
+     class BinaryIntNoRetFloat :
+          Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>;
+     class TernaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+     class TernaryIntFloat :
+          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+     class QuaternaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+     class UnaryAtomicInt :
+          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class BinaryAtomicInt :
+          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class TernaryAtomicInt :
+          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+     class UnaryAtomicIntNoRet :
+          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class BinaryAtomicIntNoRet :
+          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class TernaryAtomicIntNoRet :
+          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+}
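+
+// Usage sketch (hypothetical name; the real definitions live in
+// AMDILIntrinsics.td): an intrinsic built from the classes above would look
+// roughly like
+//
+//   def int_AMDIL_example_min : GCCBuiltin<"__amdil_example_min">,
+//                               BinaryIntInt;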
diff --git a/lib/Target/AMDGPU/AMDILFrameLowering.cpp b/lib/Target/AMDGPU/AMDILFrameLowering.cpp
new file mode 100644 (file)
index 0000000..87eca87
--- /dev/null
@@ -0,0 +1,53 @@
+//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on an AMDIL target machine
+//
+//===----------------------------------------------------------------------===//
+#include "AMDILFrameLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+
+using namespace llvm;
+AMDILFrameLowering::AMDILFrameLowering(StackDirection D, unsigned StackAl,
+    int LAO, unsigned TransAl)
+  : TargetFrameLowering(D, StackAl, LAO, TransAl)
+{
+}
+
+AMDILFrameLowering::~AMDILFrameLowering()
+{
+}
+
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index.
+int AMDILFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                         int FI) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MFI->getObjectOffset(FI);
+}
+
+const TargetFrameLowering::SpillSlot *
+AMDILFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const
+{
+  NumEntries = 0;
+  return 0;
+}
+void
+AMDILFrameLowering::emitPrologue(MachineFunction &MF) const
+{
+}
+void
+AMDILFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
+{
+}
+bool
+AMDILFrameLowering::hasFP(const MachineFunction &MF) const
+{
+  return false;
+}
diff --git a/lib/Target/AMDGPU/AMDILFrameLowering.h b/lib/Target/AMDGPU/AMDILFrameLowering.h
new file mode 100644 (file)
index 0000000..b1d919e
--- /dev/null
@@ -0,0 +1,46 @@
+//===--------------------- AMDILFrameLowering.h -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on an AMDIL target machine
+//
+//===----------------------------------------------------------------------===//
+#ifndef _AMDILFRAME_LOWERING_H_
+#define _AMDILFRAME_LOWERING_H_
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+/// Information about the stack frame layout on the AMDIL targets. It holds
+/// the direction of the stack growth, the known stack alignment on entry to
+/// each function, and the offset to the locals area.
+/// See TargetFrameInfo for more comments.
+
+namespace llvm {
+  class AMDILFrameLowering : public TargetFrameLowering {
+    public:
+      AMDILFrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned
+          TransAl = 1);
+      virtual ~AMDILFrameLowering();
+      virtual int getFrameIndexOffset(const MachineFunction &MF,
+                                         int FI) const;
+      virtual const SpillSlot *
+        getCalleeSavedSpillSlots(unsigned &NumEntries) const;
+      virtual void emitPrologue(MachineFunction &MF) const;
+      virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+      virtual bool hasFP(const MachineFunction &MF) const;
+  }; // class AMDILFrameLowering
+} // namespace llvm
+#endif // _AMDILFRAME_LOWERING_H_
diff --git a/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDILISelDAGToDAG.cpp
new file mode 100644 (file)
index 0000000..df0ac75
--- /dev/null
@@ -0,0 +1,393 @@
+//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the AMDIL target.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUISelLowering.h" // For AMDGPUISD
+#include "AMDILDevices.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+#include <list>
+#include <queue>
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AMDILDAGToDAGISel - AMDIL specific code to select AMDIL machine instructions
+// for SelectionDAG operations.
+//
+namespace {
+class AMDILDAGToDAGISel : public SelectionDAGISel {
+  // Subtarget - Keep a pointer to the AMDIL Subtarget around so that we can
+  // make the right decision when generating code for different targets.
+  const AMDILSubtarget &Subtarget;
+public:
+  AMDILDAGToDAGISel(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+  virtual ~AMDILDAGToDAGISel();
+
+  SDNode *Select(SDNode *N);
+  virtual const char *getPassName() const;
+
+private:
+  inline SDValue getSmallIPtrImm(unsigned Imm);
+
+  // Complex pattern selectors
+  bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
+  bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
+  bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
+
+  static bool checkType(const Value *ptr, unsigned int addrspace);
+  static const Value *getBasePointerValue(const Value *V);
+
+  static bool isGlobalStore(const StoreSDNode *N);
+  static bool isPrivateStore(const StoreSDNode *N);
+  static bool isLocalStore(const StoreSDNode *N);
+  static bool isRegionStore(const StoreSDNode *N);
+
+  static bool isCPLoad(const LoadSDNode *N);
+  static bool isConstantLoad(const LoadSDNode *N, int cbID);
+  static bool isGlobalLoad(const LoadSDNode *N);
+  static bool isPrivateLoad(const LoadSDNode *N);
+  static bool isLocalLoad(const LoadSDNode *N);
+  static bool isRegionLoad(const LoadSDNode *N);
+
+  bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset);
+  bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset);
+  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+
+  // Include the pieces autogenerated from the target description.
+#include "AMDGPUGenDAGISel.inc"
+};
+}  // end anonymous namespace
+
+// createAMDILISelDag - This pass converts a legalized DAG into a AMDIL-specific
+// DAG, ready for instruction scheduling.
+//
+FunctionPass *llvm::createAMDILISelDag(TargetMachine &TM
+                                        AMDIL_OPT_LEVEL_DECL) {
+  return new AMDILDAGToDAGISel(TM AMDIL_OPT_LEVEL_VAR);
+}
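+
+// Sketch of the intended call site; the backend's pass configuration adds
+// this pass during instruction selection, roughly as follows (exact names
+// may differ):
+//
+//   bool AMDGPUPassConfig::addInstSelector() {
+//     PM->add(createAMDILISelDag(getAMDGPUTargetMachine()));
+//     return false;
+//   }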
+
+AMDILDAGToDAGISel::AMDILDAGToDAGISel(TargetMachine &TM
+                                      AMDIL_OPT_LEVEL_DECL)
+  : SelectionDAGISel(TM AMDIL_OPT_LEVEL_VAR), Subtarget(TM.getSubtarget<AMDILSubtarget>())
+{
+}
+
+AMDILDAGToDAGISel::~AMDILDAGToDAGISel() {
+}
+
+SDValue AMDILDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
+  return CurDAG->getTargetConstant(Imm, MVT::i32);
+}
+
+bool AMDILDAGToDAGISel::SelectADDRParam(
+    SDValue Addr, SDValue& R1, SDValue& R2) {
+
+  if (Addr.getOpcode() == ISD::FrameIndex) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    } else {
+      R1 = Addr;
+      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    }
+  } else if (Addr.getOpcode() == ISD::ADD) {
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+  } else {
+    R1 = Addr;
+    R2 = CurDAG->getTargetConstant(0, MVT::i32);
+  }
+  return true;
+}
+
+bool AMDILDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+  return SelectADDRParam(Addr, R1, R2);
+}
+
+
+bool AMDILDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+
+  if (Addr.getOpcode() == ISD::FrameIndex) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    } else {
+      R1 = Addr;
+      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    }
+  } else if (Addr.getOpcode() == ISD::ADD) {
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+  } else {
+    R1 = Addr;
+    R2 = CurDAG->getTargetConstant(0, MVT::i64);
+  }
+  return true;
+}
+
+SDNode *AMDILDAGToDAGISel::Select(SDNode *N) {
+  unsigned int Opc = N->getOpcode();
+  if (N->isMachineOpcode()) {
+    return NULL;   // Already selected.
+  }
+  switch (Opc) {
+  default: break;
+  case ISD::FrameIndex:
+    {
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N)) {
+        unsigned int FI = FIN->getIndex();
+        EVT OpVT = N->getValueType(0);
+        unsigned int NewOpc = AMDGPU::COPY;
+        SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32);
+        return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI);
+      }
+    }
+    break;
+  }
+  return SelectCode(N);
+}
+
+bool AMDILDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
+  if (!ptr) {
+    return false;
+  }
+  // Guard against non-pointer types rather than dereferencing a null dyn_cast.
+  const PointerType *PT = dyn_cast<PointerType>(ptr->getType());
+  return PT && PT->getAddressSpace() == addrspace;
+}
+
+const Value * AMDILDAGToDAGISel::getBasePointerValue(const Value *V)
+{
+  if (!V) {
+    return NULL;
+  }
+  const Value *ret = NULL;
+  ValueMap<const Value *, bool> ValueBitMap;
+  std::queue<const Value *, std::list<const Value *> > ValueQueue;
+  ValueQueue.push(V);
+  while (!ValueQueue.empty()) {
+    V = ValueQueue.front();
+    if (ValueBitMap.find(V) == ValueBitMap.end()) {
+      ValueBitMap[V] = true;
+      if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) {
+        ret = V;
+        break;
+      } else if (dyn_cast<GlobalVariable>(V)) {
+        ret = V;
+        break;
+      } else if (dyn_cast<Constant>(V)) {
+        const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
+        if (CE) {
+          ValueQueue.push(CE->getOperand(0));
+        }
+      } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+        ret = AI;
+        break;
+      } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
+        uint32_t numOps = I->getNumOperands();
+        for (uint32_t x = 0; x < numOps; ++x) {
+          ValueQueue.push(I->getOperand(x));
+        }
+      } else {
+        // assert(0 && "Found a Value that we didn't know how to handle!");
+      }
+    }
+    ValueQueue.pop();
+  }
+  return ret;
+}
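+
+// Example of the walk above: for an address that is a constant GEP such as
+// 'getelementptr @G, 0, 1', the loop visits the ConstantExpr, pushes its
+// first operand, reaches the GlobalVariable @G, and returns it as the base
+// pointer.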
+
+bool AMDILDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
+  return (!checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS));
+}
+
+bool AMDILDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
+  if (checkType(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)) {
+    return true;
+  }
+  // Test MMO for null before dereferencing it, and reuse the base-pointer
+  // walk rather than recomputing it.
+  MachineMemOperand *MMO = N->getMemOperand();
+  if (!MMO || !MMO->getValue()) {
+    return false;
+  }
+  const Value *V = MMO->getValue();
+  const Value *BV = getBasePointerValue(V);
+  if (isa<GlobalValue>(V) || (BV && isa<GlobalValue>(BV))) {
+    return checkType(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS);
+  }
+  return false;
+}
+
+bool AMDILDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isLocalLoad(const LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isRegionLoad(const LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
+  MachineMemOperand *MMO = N->getMemOperand();
+  if (checkType(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
+    if (MMO) {
+      const Value *V = MMO->getValue();
+      const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
+      if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool AMDILDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
+  if (checkType(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
+    // Check to make sure we are not a constant pool load or a constant load
+    // that is marked as a private load
+    if (isCPLoad(N) || isConstantLoad(N, -1)) {
+      return false;
+    }
+  }
+  if (!checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::PARAM_D_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::PARAM_I_ADDRESS))
+  {
+    return true;
+  }
+  return false;
+}
+
+const char *AMDILDAGToDAGISel::getPassName() const {
+  return "AMDIL DAG->DAG Pattern Instruction Selection";
+}
+
+#ifdef DEBUGTMP
+#undef INT64_C
+#endif
+#undef DEBUGTMP
+
+///==== AMDGPU Functions ====///
+
+bool AMDILDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base,
+                                             SDValue& Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    bool Match = false;
+
+    // Find the base ptr and the offset
+    for (unsigned i = 0; i < Addr.getNumOperands(); i++) {
+      SDValue Arg = Addr.getOperand(i);
+      ConstantSDNode * OffsetNode = dyn_cast<ConstantSDNode>(Arg);
+      // This arg isn't a constant so it must be the base PTR.
+      if (!OffsetNode) {
+        Base = Addr.getOperand(i);
+        continue;
+      }
+      // Check if the constant argument fits in 8-bits.  The offset is in bytes
+      // so we need to convert it to dwords.
+      if (isInt<8>(OffsetNode->getZExtValue() >> 2)) {
+        Match = true;
+        Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue() >> 2,
+                                           MVT::i32);
+      }
+    }
+    return Match;
+  }
+
+  // Default case, no offset
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
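+
+// Worked example for the conversion above: an address of (add %ptr, 32) has
+// a 32-byte constant offset; 32 >> 2 == 8 dwords, which fits in 8 bits, so
+// the selector produces Base = %ptr and Offset = 8.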
+
+bool AMDILDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset)
+{
+  ConstantSDNode * IMMOffset;
+
+  if (Addr.getOpcode() == ISD::ADD
+      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && isInt<16>(IMMOffset->getZExtValue())) {
+
+      Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+      return true;
+  // If the pointer address is constant, we can move it to the offset field.
+  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
+             && isInt<16>(IMMOffset->getZExtValue())) {
+    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                  CurDAG->getEntryNode().getDebugLoc(),
+                                  AMDGPU::ZERO, MVT::i32);
+    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+    return true;
+  }
+
+  // Default case, no offset
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool AMDILDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base,
+                                      SDValue& Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress  ||
+      Addr.getOpcode() != ISD::ADD) {
+    return false;
+  }
+
+  Base = Addr.getOperand(0);
+  Offset = Addr.getOperand(1);
+
+  return false;
+}
diff --git a/lib/Target/AMDGPU/AMDILISelLowering.cpp b/lib/Target/AMDGPU/AMDILISelLowering.cpp
new file mode 100644 (file)
index 0000000..af99122
--- /dev/null
@@ -0,0 +1,1850 @@
+//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that AMDIL uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILISelLowering.h"
+#include "AMDILDevices.h"
+#include "AMDILIntrinsicInfo.h"
+#include "AMDILRegisterInfo.h"
+#include "AMDILSubtarget.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+#define ISDBITCAST  ISD::BITCAST
+#define MVTGLUE     MVT::Glue
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+#include "AMDGPUGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation Help Functions Begin
+//===----------------------------------------------------------------------===//
+static SDValue
+getConversionNode(SelectionDAG &DAG, SDValue& Src, SDValue& Dst, bool asType)
+{
+  DebugLoc DL = Src.getDebugLoc();
+  EVT svt = Src.getValueType().getScalarType();
+  EVT dvt = Dst.getValueType().getScalarType();
+  if (svt.isFloatingPoint() && dvt.isFloatingPoint()) {
+    if (dvt.bitsGT(svt)) {
+      Src = DAG.getNode(ISD::FP_EXTEND, DL, dvt, Src);
+    } else if (dvt.bitsLT(svt)) {
+      Src = DAG.getNode(ISD::FP_ROUND, DL, dvt, Src,
+          DAG.getConstant(1, MVT::i32));
+    }
+  } else if (svt.isInteger() && dvt.isInteger()) {
+    if (!svt.bitsEq(dvt)) {
+      Src = DAG.getSExtOrTrunc(Src, DL, dvt);
+    }
+  } else if (svt.isInteger()) {
+    unsigned opcode = (asType) ? ISDBITCAST : ISD::SINT_TO_FP;
+    if (!svt.bitsEq(dvt)) {
+      if (dvt.getSimpleVT().SimpleTy == MVT::f32) {
+        Src = DAG.getSExtOrTrunc(Src, DL, MVT::i32);
+      } else if (dvt.getSimpleVT().SimpleTy == MVT::f64) {
+        Src = DAG.getSExtOrTrunc(Src, DL, MVT::i64);
+      } else {
+        assert(0 && "We only support 32 and 64bit fp types");
+      }
+    }
+    Src = DAG.getNode(opcode, DL, dvt, Src);
+  } else if (dvt.isInteger()) {
+    unsigned opcode = (asType) ? ISDBITCAST : ISD::FP_TO_SINT;
+    if (svt.getSimpleVT().SimpleTy == MVT::f32) {
+      Src = DAG.getNode(opcode, DL, MVT::i32, Src);
+    } else if (svt.getSimpleVT().SimpleTy == MVT::f64) {
+      Src = DAG.getNode(opcode, DL, MVT::i64, Src);
+    } else {
+      assert(0 && "We only support 32 and 64bit fp types");
+    }
+    Src = DAG.getSExtOrTrunc(Src, DL, dvt);
+  }
+  return Src;
+}
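+
+// Example behavior of getConversionNode: for an i16 source and an f32
+// destination with asType == false, the source is first sign-extended to
+// i32 and then converted with ISD::SINT_TO_FP; with asType == true the
+// widened integer is bitcast to f32 instead.
+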
+// CondCCodeToCC - Convert a DAG condition code to a AMDIL CC
+// condition.
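+// For example, (ISD::SETGT, MVT::f32) maps to AMDILCC::IL_CC_F_GT, while
+// (ISD::SETGT, MVT::i64) maps to AMDILCC::IL_CC_L_GT.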
+static AMDILCC::CondCodes
+CondCCodeToCC(ISD::CondCode CC, const MVT::SimpleValueType& type)
+{
+  switch (CC) {
+    default:
+      {
+        errs() << "Condition Code: " << (unsigned int)CC << "\n";
+        assert(0 && "Unknown condition code!");
+      }
+    case ISD::SETO:
+      switch(type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_O;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_O;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETUO:
+      switch(type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_UO;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_UO;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETGT:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_GT;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_GT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_GT;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_GT;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETGE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_GE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_GE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_GE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_GE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETLT:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_LT;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_LT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_LT;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_LT;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETLE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_LE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_LE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_LE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_LE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETNE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_NE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_NE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_NE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_NE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETEQ:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_EQ;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_EQ;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_EQ;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_EQ;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETUGT:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_GT;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_UGT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_UGT;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_GT;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETUGE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_GE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_UGE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_UGE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_GE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETULT:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_LT;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_ULT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_ULT;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_LT;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETULE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_LE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_ULE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_ULE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_LE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETUNE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_NE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_UNE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_UNE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_NE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETUEQ:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_EQ;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_UEQ;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_UEQ;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_EQ;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETOGT:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_OGT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_OGT;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETOGE:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_OGE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_OGE;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETOLT:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_OLT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_OLT;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETOLE:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_OLE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_OLE;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETONE:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_ONE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_ONE;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETOEQ:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_OEQ;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_OEQ;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+  };
+}
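+// Example mappings from the table above: CondCCodeToCC(ISD::SETGE, MVT::f64)
+// yields IL_CC_D_GE and CondCCodeToCC(ISD::SETUGT, MVT::i64) yields
+// IL_CC_UL_GT; the ordered comparisons (SETO*) are only valid for f32/f64.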
+
+SDValue
+AMDILTargetLowering::LowerMemArgument(
+    SDValue Chain,
+    CallingConv::ID CallConv,
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    DebugLoc dl, SelectionDAG &DAG,
+    const CCValAssign &VA,
+    MachineFrameInfo *MFI,
+    unsigned i) const
+{
+  // Create the nodes corresponding to a load from this parameter slot.
+  ISD::ArgFlagsTy Flags = Ins[i].Flags;
+
+  bool AlwaysUseMutable = (CallConv==CallingConv::Fast) &&
+    getTargetMachine().Options.GuaranteedTailCallOpt;
+  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can
+  // be changed with more analysis.
+  // In case of tail call optimization mark all arguments mutable, since they
+  // could be overwritten by the lowering of arguments in case of a tail call.
+  int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+      VA.getLocMemOffset(), isImmutable);
+  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+  if (Flags.isByVal())
+    return FIN;
+  return DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+      MachinePointerInfo::getFixedStack(FI),
+      false, false, false, 0);
+}
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation Help Functions End
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Class Implementation Begins
+//===----------------------------------------------------------------------===//
+  AMDILTargetLowering::AMDILTargetLowering(TargetMachine &TM)
+: TargetLowering(TM, new TargetLoweringObjectFileELF())
+{
+  int types[] =
+  {
+    (int)MVT::i8,
+    (int)MVT::i16,
+    (int)MVT::i32,
+    (int)MVT::f32,
+    (int)MVT::f64,
+    (int)MVT::i64,
+    (int)MVT::v2i8,
+    (int)MVT::v4i8,
+    (int)MVT::v2i16,
+    (int)MVT::v4i16,
+    (int)MVT::v4f32,
+    (int)MVT::v4i32,
+    (int)MVT::v2f32,
+    (int)MVT::v2i32,
+    (int)MVT::v2f64,
+    (int)MVT::v2i64
+  };
+
+  int IntTypes[] =
+  {
+    (int)MVT::i8,
+    (int)MVT::i16,
+    (int)MVT::i32,
+    (int)MVT::i64
+  };
+
+  int FloatTypes[] =
+  {
+    (int)MVT::f32,
+    (int)MVT::f64
+  };
+
+  int VectorTypes[] =
+  {
+    (int)MVT::v2i8,
+    (int)MVT::v4i8,
+    (int)MVT::v2i16,
+    (int)MVT::v4i16,
+    (int)MVT::v4f32,
+    (int)MVT::v4i32,
+    (int)MVT::v2f32,
+    (int)MVT::v2i32,
+    (int)MVT::v2f64,
+    (int)MVT::v2i64
+  };
+  size_t numTypes = sizeof(types) / sizeof(*types);
+  size_t numFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
+  size_t numIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
+  size_t numVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);
+
+  const AMDILSubtarget &STM = getTargetMachine().getSubtarget<AMDILSubtarget>();
+  // These are the current register classes that are
+  // supported
+
+  for (unsigned int x  = 0; x < numTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
+
+    // FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types
+    // We cannot sextinreg; expand to shifts
+    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
+    setOperationAction(ISD::SUBE, VT, Expand);
+    setOperationAction(ISD::SUBC, VT, Expand);
+    setOperationAction(ISD::ADDE, VT, Expand);
+    setOperationAction(ISD::ADDC, VT, Expand);
+    setOperationAction(ISD::SETCC, VT, Custom);
+    setOperationAction(ISD::BRCOND, VT, Custom);
+    setOperationAction(ISD::BR_CC, VT, Custom);
+    setOperationAction(ISD::BR_JT, VT, Expand);
+    setOperationAction(ISD::BRIND, VT, Expand);
+    // TODO: Implement custom UREM/SREM routines
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::GlobalAddress, VT, Custom);
+    setOperationAction(ISD::JumpTable, VT, Custom);
+    setOperationAction(ISD::ConstantPool, VT, Custom);
+    setOperationAction(ISD::SELECT, VT, Custom);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    if (VT != MVT::i64 && VT != MVT::v2i64) {
+      setOperationAction(ISD::SDIV, VT, Custom);
+    }
+  }
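+  // For reference: "Expand" asks legalization to rewrite a node in terms of
+  // other operations, "Custom" routes it through LowerOperation() below, and
+  // "Legal" leaves it for instruction selection as-is.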
+  for (unsigned int x = 0; x < numFloatTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
+
+    // IL does not have these operations for floating point types
+    setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
+    setOperationAction(ISD::SETOLT, VT, Expand);
+    setOperationAction(ISD::SETOGE, VT, Expand);
+    setOperationAction(ISD::SETOGT, VT, Expand);
+    setOperationAction(ISD::SETOLE, VT, Expand);
+    setOperationAction(ISD::SETULT, VT, Expand);
+    setOperationAction(ISD::SETUGE, VT, Expand);
+    setOperationAction(ISD::SETUGT, VT, Expand);
+    setOperationAction(ISD::SETULE, VT, Expand);
+  }
+
+  for (unsigned int x = 0; x < numIntTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
+
+    // GPU also does not have divrem function for signed or unsigned
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+
+    // GPU does not have [S|U]MUL_LOHI functions as a single instruction
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+    // GPU doesn't have a rotl, rotr, or byteswap instruction
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
+
+    // GPU doesn't have any counting operators
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+  }
+
+  for ( unsigned int ii = 0; ii < numVectorTypes; ++ii )
+  {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];
+
+    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    // setOperationAction(ISD::VSETCC, VT, Expand);
+    setOperationAction(ISD::SETCC, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+    setOperationAction(ISD::SELECT, VT, Expand);
+
+  }
+  if (STM.device()->isSupported(AMDILDeviceInfo::LongOps)) {
+    setOperationAction(ISD::MULHU, MVT::i64, Expand);
+    setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
+    setOperationAction(ISD::MULHS, MVT::i64, Expand);
+    setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
+    setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+    setOperationAction(ISD::SREM, MVT::v2i64, Expand);
+    setOperationAction(ISD::Constant          , MVT::i64  , Legal);
+    setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
+  }
+  if (STM.device()->isSupported(AMDILDeviceInfo::DoubleOps)) {
+    // we support loading/storing v2f64 but not operations on the type
+    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ConstantFP        , MVT::f64  , Legal);
+    // We want to expand vector conversions into their scalar
+    // counterparts.
+    setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::FABS, MVT::f64, Expand);
+    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+  }
+  // TODO: Fix the UDIV24 algorithm so it works for these
+  // types correctly. This needs vector comparisons
+  // for this to work correctly.
+  setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
+  setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
+  setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
+  setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+  setOperationAction(ISD::SUBC, MVT::Other, Expand);
+  setOperationAction(ISD::ADDE, MVT::Other, Expand);
+  setOperationAction(ISD::ADDC, MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+  setOperationAction(ISD::BR_CC, MVT::Other, Custom);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  setOperationAction(ISD::SETCC, MVT::Other, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
+
+  setOperationAction(ISD::BUILD_VECTOR, MVT::Other, Custom);
+  // Use the default implementation.
+  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+  setOperationAction(ISD::ConstantFP        , MVT::f32    , Legal);
+  setOperationAction(ISD::Constant          , MVT::i32    , Legal);
+  setOperationAction(ISD::TRAP              , MVT::Other  , Legal);
+
+  setStackPointerRegisterToSaveRestore(AMDGPU::SP);
+  setSchedulingPreference(Sched::RegPressure);
+  setPow2DivIsCheap(false);
+  setPrefLoopAlignment(16);
+  setSelectIsExpensive(true);
+  setJumpIsExpensive(true);
+
+  maxStoresPerMemcpy  = 4096;
+  maxStoresPerMemmove = 4096;
+  maxStoresPerMemset  = 4096;
+
+}
+
+const char *
+AMDILTargetLowering::getTargetNodeName(unsigned Opcode) const
+{
+  switch (Opcode) {
+    default: return 0;
+    case AMDILISD::CMOVLOG:  return "AMDILISD::CMOVLOG";
+    case AMDILISD::MAD:  return "AMDILISD::MAD";
+    case AMDILISD::CALL:  return "AMDILISD::CALL";
+    case AMDILISD::SELECT_CC: return "AMDILISD::SELECT_CC";
+    case AMDILISD::UMUL: return "AMDILISD::UMUL";
+    case AMDILISD::DIV_INF: return "AMDILISD::DIV_INF";
+    case AMDILISD::VBUILD: return "AMDILISD::VBUILD";
+    case AMDILISD::CMP: return "AMDILISD::CMP";
+    case AMDILISD::IL_CC_I_LT: return "AMDILISD::IL_CC_I_LT";
+    case AMDILISD::IL_CC_I_LE: return "AMDILISD::IL_CC_I_LE";
+    case AMDILISD::IL_CC_I_GT: return "AMDILISD::IL_CC_I_GT";
+    case AMDILISD::IL_CC_I_GE: return "AMDILISD::IL_CC_I_GE";
+    case AMDILISD::IL_CC_I_EQ: return "AMDILISD::IL_CC_I_EQ";
+    case AMDILISD::IL_CC_I_NE: return "AMDILISD::IL_CC_I_NE";
+    case AMDILISD::RET_FLAG: return "AMDILISD::RET_FLAG";
+    case AMDILISD::BRANCH_COND: return "AMDILISD::BRANCH_COND";
+
+  };
+}
+bool
+AMDILTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+    const CallInst &I, unsigned Intrinsic) const
+{
+  return false;
+}
+
+// The backend supports 32-bit and 64-bit floating point immediates
+bool
+AMDILTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const
+{
+  if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
+      || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool
+AMDILTargetLowering::ShouldShrinkFPConstant(EVT VT) const
+{
+  if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
+      || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+
+// computeMaskedBitsForTargetNode - Determine which bits of Op are known to
+// be zero or one for this target-specific node and return them in
+// KnownZero/KnownOne. Used by the DAG combiner.
+
+void
+AMDILTargetLowering::computeMaskedBitsForTargetNode(
+    const SDValue Op,
+    APInt &KnownZero,
+    APInt &KnownOne,
+    const SelectionDAG &DAG,
+    unsigned Depth) const
+{
+  APInt KnownZero2;
+  APInt KnownOne2;
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything
+  switch (Op.getOpcode()) {
+    default: break;
+    case AMDILISD::SELECT_CC:
+             DAG.ComputeMaskedBits(
+                 Op.getOperand(1),
+                 KnownZero,
+                 KnownOne,
+                 Depth + 1
+                 );
+             DAG.ComputeMaskedBits(
+                 Op.getOperand(0),
+                 KnownZero2,
+                 KnownOne2,
+                 Depth + 1
+                 );
+             assert((KnownZero & KnownOne) == 0
+                 && "Bits known to be one AND zero?");
+             assert((KnownZero2 & KnownOne2) == 0
+                 && "Bits known to be one AND zero?");
+             // Only known if known in both the LHS and RHS
+             KnownOne &= KnownOne2;
+             KnownZero &= KnownZero2;
+             break;
+  };
+}
+
+// This is the function that determines which calling convention should
+// be used. Currently there is only one calling convention
+CCAssignFn*
+AMDILTargetLowering::CCAssignFnForNode(unsigned int Op) const
+{
+  //uint64_t CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  return CC_AMDIL32;
+}
+
+// LowerCallResult - Lower the result values of an ISD::CALL into the
+// appropriate copies out of the corresponding physical registers.  This
+// assumes that Chain/InFlag are the input chain/flag to use, and that
+// TheCall is the call being lowered.  It returns an SDNode with the same
+// number of values as the ISD::CALL.
+SDValue
+AMDILTargetLowering::LowerCallResult(
+    SDValue Chain,
+    SDValue InFlag,
+    CallingConv::ID CallConv,
+    bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    DebugLoc dl,
+    SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const
+{
+  // Assign locations to each value returned by this call
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_AMDIL32);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    EVT CopyVT = RVLocs[i].getValVT();
+    if (RVLocs[i].isRegLoc()) {
+      Chain = DAG.getCopyFromReg(
+          Chain,
+          dl,
+          RVLocs[i].getLocReg(),
+          CopyVT,
+          InFlag
+          ).getValue(1);
+      SDValue Val = Chain.getValue(0);
+      InFlag = Chain.getValue(2);
+      InVals.push_back(Val);
+    }
+  }
+
+  return Chain;
+
+}
+
+//===----------------------------------------------------------------------===//
+//                           Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+// Recursively assign SDNodeOrdering to any unordered nodes
+// This is necessary to maintain source ordering of instructions
+// under -O0 to avoid odd-looking "skipping around" issues.
+  static const SDValue
+Ordered( SelectionDAG &DAG, unsigned order, const SDValue New )
+{
+  if (order != 0 && DAG.GetOrdering( New.getNode() ) == 0) {
+    DAG.AssignOrdering( New.getNode(), order );
+    for (unsigned i = 0, e = New.getNumOperands(); i < e; ++i)
+      Ordered( DAG, order, New.getOperand(i) );
+  }
+  return New;
+}
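+// The LOWER macro below wraps each custom-lowered result in Ordered() so
+// that replacement nodes inherit the ordering of the node they replace.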
+
+#define LOWER(A) \
+  case ISD:: A: \
+return Ordered( DAG, DAG.GetOrdering( Op.getNode() ), Lower##A(Op, DAG) )
+
+SDValue
+AMDILTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
+{
+  switch (Op.getOpcode()) {
+    default:
+      Op.getNode()->dump();
+      assert(0 && "Custom lowering code for this"
+          "instruction is not implemented yet!");
+      break;
+      LOWER(GlobalAddress);
+      LOWER(JumpTable);
+      LOWER(ConstantPool);
+      LOWER(ExternalSymbol);
+      LOWER(SDIV);
+      LOWER(SREM);
+      LOWER(BUILD_VECTOR);
+      LOWER(SELECT);
+      LOWER(SETCC);
+      LOWER(SIGN_EXTEND_INREG);
+      LOWER(DYNAMIC_STACKALLOC);
+      LOWER(BRCOND);
+      LOWER(BR_CC);
+  }
+  return Op;
+}
+
+#undef LOWER
+
+SDValue
+AMDILTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue DST = Op;
+  const GlobalAddressSDNode *GADN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *G = GADN->getGlobal();
+  DebugLoc DL = Op.getDebugLoc();
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(G);
+  if (!GV) {
+    DST = DAG.getTargetGlobalAddress(G, DL, MVT::i32);
+  } else {
+    if (GV->hasInitializer()) {
+      const Constant *C = dyn_cast<Constant>(GV->getInitializer());
+      if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+        DST = DAG.getConstant(CI->getValue(), Op.getValueType());
+      } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(C)) {
+        DST = DAG.getConstantFP(CF->getValueAPF(),
+            Op.getValueType());
+      } else if (dyn_cast<ConstantAggregateZero>(C)) {
+        EVT VT = Op.getValueType();
+        if (VT.isInteger()) {
+          DST = DAG.getConstant(0, VT);
+        } else {
+          DST = DAG.getConstantFP(0, VT);
+        }
+      } else {
+        assert(!"lowering this type of Global Address "
+            "not implemented yet!");
+        C->dump();
+        DST = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
+      }
+    } else {
+      DST = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
+    }
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const
+{
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32);
+  return Result;
+}
+SDValue
+AMDILTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+{
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  EVT PtrVT = Op.getValueType();
+  SDValue Result;
+  if (CP->isMachineConstantPoolEntry()) {
+    Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+        CP->getAlignment(), CP->getOffset(), CP->getTargetFlags());
+  } else {
+    Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+        CP->getAlignment(), CP->getOffset(), CP->getTargetFlags());
+  }
+  return Result;
+}
+
+SDValue
+AMDILTargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const
+{
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, MVT::i32);
+  return Result;
+}
+
+/// LowerFormalArguments - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments placed on the stack.
+/// TODO: isVarArg, hasStructRet, isMemReg
+  SDValue
+AMDILTargetLowering::LowerFormalArguments(SDValue Chain,
+    CallingConv::ID CallConv,
+    bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    DebugLoc dl,
+    SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals)
+const
+{
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  //const Function *Fn = MF.getFunction();
+  //MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CallingConv::ID CC = MF.getFunction()->getCallingConv();
+  //bool hasStructRet = MF.getFunction()->hasStructRetAttr();
+
+  CCState CCInfo(CC, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // When more calling conventions are added, they need to be chosen here
+  CCInfo.AnalyzeFormalArguments(Ins, CC_AMDIL32);
+  SDValue StackPtr;
+
+  //unsigned int FirstStackArgLoc = 0;
+
+  for (unsigned int i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+      const TargetRegisterClass *RC = getRegClassFor(
+          RegVT.getSimpleVT().SimpleTy);
+
+      unsigned int Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      SDValue ArgValue = DAG.getCopyFromReg(
+          Chain,
+          dl,
+          Reg,
+          RegVT);
+      // If this is an 8 or 16-bit value, it is really passed
+      // promoted to 32 bits.  Insert an assert[sz]ext to capture
+      // this, then truncate to the right size.
+
+      if (VA.getLocInfo() == CCValAssign::SExt) {
+        ArgValue = DAG.getNode(
+            ISD::AssertSext,
+            dl,
+            RegVT,
+            ArgValue,
+            DAG.getValueType(VA.getValVT()));
+      } else if (VA.getLocInfo() == CCValAssign::ZExt) {
+        ArgValue = DAG.getNode(
+            ISD::AssertZext,
+            dl,
+            RegVT,
+            ArgValue,
+            DAG.getValueType(VA.getValVT()));
+      }
+      if (VA.getLocInfo() != CCValAssign::Full) {
+        ArgValue = DAG.getNode(
+            ISD::TRUNCATE,
+            dl,
+            VA.getValVT(),
+            ArgValue);
+      }
+      // Add the value to the list of arguments
+      // to be passed in registers
+      InVals.push_back(ArgValue);
+      if (isVarArg) {
+        assert(0 && "Variable arguments are not yet supported");
+        // See MipsISelLowering.cpp for ideas on how to implement
+      }
+    } else if(VA.isMemLoc()) {
+      InVals.push_back(LowerMemArgument(Chain, CallConv, Ins,
+            dl, DAG, VA, MFI, i));
+    } else {
+      assert(0 && "found a Value Assign that is "
+          "neither a register or a memory location");
+    }
+  }
+  /*if (hasStructRet) {
+    assert(0 && "Has struct return is not yet implemented");
+  // See MipsISelLowering.cpp for ideas on how to implement
+  }*/
+
+  if (isVarArg) {
+    assert(0 && "Variable arguments are not yet supported");
+    // See X86/PPC/CellSPU ISelLowering.cpp for ideas on how to implement
+  }
+  // This needs to be changed to non-zero if the return function needs
+  // to pop bytes
+  return Chain;
+}
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" with size and alignment information specified by
+/// the specific parameter attribute. The copy will be passed as a byval
+/// function parameter.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+    ISD::ArgFlagsTy Flags, SelectionDAG &DAG) {
+  assert(0 && "MemCopy does not exist yet");
+  SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+
+  return DAG.getMemcpy(Chain,
+      Src.getDebugLoc(),
+      Dst, Src, SizeNode, Flags.getByValAlign(),
+      /*IsVol=*/false, /*AlwaysInline=*/true, 
+      MachinePointerInfo(), MachinePointerInfo());
+}
+
+SDValue
+AMDILTargetLowering::LowerMemOpCallTo(SDValue Chain,
+    SDValue StackPtr, SDValue Arg,
+    DebugLoc dl, SelectionDAG &DAG,
+    const CCValAssign &VA,
+    ISD::ArgFlagsTy Flags) const
+{
+  unsigned int LocMemOffset = VA.getLocMemOffset();
+  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+  PtrOff = DAG.getNode(ISD::ADD,
+      dl,
+      getPointerTy(), StackPtr, PtrOff);
+  if (Flags.isByVal()) {
+    PtrOff = CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG);
+  } else {
+    PtrOff = DAG.getStore(Chain, dl, Arg, PtrOff,
+        MachinePointerInfo::getStack(LocMemOffset),
+        false, false, 0);
+  }
+  return PtrOff;
+}
+/// LowerCall - function arguments are copied from virtual
+/// regs to (physical regs)/(stack frame); CALLSEQ_START and
+/// CALLSEQ_END are emitted.
+/// TODO: isVarArg, isTailCall, hasStructRet
+SDValue
+AMDILTargetLowering::LowerCall(CallLoweringInfo &CLI,
+    SmallVectorImpl<SDValue> &InVals) const
+
+#if 0
+    SDValue Chain, SDValue Callee,
+    CallingConv::ID CallConv, bool isVarArg, bool doesNotRet,
+    bool& isTailCall,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    DebugLoc dl, SelectionDAG &DAG,
+#endif
+{
+  CLI.IsTailCall = false;
+  MachineFunction& MF = CLI.DAG.getMachineFunction();
+  // FIXME: Do we need to handle fast calling conventions and tail call
+  // optimizations? See X86/PPC ISelLowering.
+  /*bool hasStructRet = (TheCall->getNumArgs())
+    ? TheCall->getArgFlags(0).device()->isSRet()
+    : false;*/
+
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Analyze operands of the call, assigning locations to each operand
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CLI.CallConv, CLI.IsVarArg, CLI.DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *CLI.DAG.getContext());
+  // Analyze the call operands; this needs to change
+  // if we support more than one calling convention
+  CCInfo.AnalyzeCallOperands(CLI.Outs, CCAssignFnForNode(CLI.CallConv));
+
+  unsigned int NumBytes = CCInfo.getNextStackOffset();
+  if (CLI.IsTailCall) {
+    assert(CLI.IsTailCall && "Tail Call not handled yet!");
+    // See X86/PPC ISelLowering
+  }
+
+  CLI.Chain = CLI.DAG.getCALLSEQ_START(CLI.Chain,
+                                   CLI.DAG.getIntPtrConstant(NumBytes, true));
+
+  SmallVector<std::pair<unsigned int, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  SDValue StackPtr;
+  //unsigned int FirstStacArgLoc = 0;
+  //int LastArgStackLoc = 0;
+
+  // Walk the register/memloc assignments, insert copies/loads
+  for (unsigned int i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    //bool isByVal = Flags.isByVal(); // handle byval/bypointer registers
+    // Arguments start after the first 5 operands of ISD::CALL
+    SDValue Arg = CLI.OutVals[i];
+    //Promote the value if needed
+    switch(VA.getLocInfo()) {
+      default: assert(0 && "Unknown loc info!");
+      case CCValAssign::Full:
+               break;
+      case CCValAssign::SExt:
+               Arg = CLI.DAG.getNode(ISD::SIGN_EXTEND,
+                   CLI.DL,
+                   VA.getLocVT(), Arg);
+               break;
+      case CCValAssign::ZExt:
+               Arg = CLI.DAG.getNode(ISD::ZERO_EXTEND,
+                   CLI.DL,
+                   VA.getLocVT(), Arg);
+               break;
+      case CCValAssign::AExt:
+               Arg = CLI.DAG.getNode(ISD::ANY_EXTEND,
+                   CLI.DL,
+                   VA.getLocVT(), Arg);
+               break;
+    }
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else if (VA.isMemLoc()) {
+      // Create the frame index object for this incoming parameter
+      int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+          VA.getLocMemOffset(), true);
+      SDValue PtrOff = CLI.DAG.getFrameIndex(FI,getPointerTy());
+
+      // emit an ISD::STORE which stores the
+      // parameter value to a stack location
+      MemOpChains.push_back(CLI.DAG.getStore(CLI.Chain, CLI.DL, Arg, PtrOff,
+            MachinePointerInfo::getFixedStack(FI),
+            false, false, 0));
+    } else {
+      assert(0 && "Not a Reg/Mem Loc, major error!");
+    }
+  }
+  if (!MemOpChains.empty()) {
+    CLI.Chain = CLI.DAG.getNode(ISD::TokenFactor,
+        CLI.DL,
+        MVT::Other,
+        &MemOpChains[0],
+        MemOpChains.size());
+  }
+  SDValue InFlag;
+  if (!CLI.IsTailCall) {
+    for (unsigned int i = 0, e = RegsToPass.size(); i != e; ++i) {
+      CLI.Chain = CLI.DAG.getCopyToReg(CLI.Chain,
+          CLI.DL,
+          RegsToPass[i].first,
+          RegsToPass[i].second,
+          InFlag);
+      InFlag = CLI.Chain.getValue(1);
+    }
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+  // every direct call is) turn it into a TargetGlobalAddress/
+  // TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(CLI.Callee))  {
+    CLI.Callee = CLI.DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, getPointerTy());
+  }
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) {
+    CLI.Callee = CLI.DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+  }
+  else if (CLI.IsTailCall) {
+    assert(0 && "Tail calls are not handled yet");
+    // see X86 ISelLowering for ideas on implementation: 1708
+  }
+
+  SDVTList NodeTys = CLI.DAG.getVTList(MVT::Other, MVTGLUE);
+  SmallVector<SDValue, 8> Ops;
+
+  if (CLI.IsTailCall) {
+    assert(0 && "Tail calls are not handled yet");
+    // see X86 ISelLowering for ideas on implementation: 1721
+  }
+  // If this is a direct call, pass the chain and the callee
+  if (CLI.Callee.getNode()) {
+    Ops.push_back(CLI.Chain);
+    Ops.push_back(CLI.Callee);
+  }
+
+  if (CLI.IsTailCall) {
+    assert(0 && "Tail calls are not handled yet");
+    // see X86 ISelLowering for ideas on implementation: 1739
+  }
+
+  // Add argument registers to the end of the list so that they are known
+  // live into the call
+  for (unsigned int i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Ops.push_back(CLI.DAG.getRegister(
+          RegsToPass[i].first,
+          RegsToPass[i].second.getValueType()));
+  }
+  if (InFlag.getNode()) {
+    Ops.push_back(InFlag);
+  }
+
+  // Emit Tail Call
+  if (CLI.IsTailCall) {
+    assert(0 && "Tail calls are not handled yet");
+    // see X86 ISelLowering for ideas on implementation: 1762
+  }
+
+  CLI.Chain = CLI.DAG.getNode(AMDILISD::CALL,
+      CLI.DL,
+      NodeTys, &Ops[0], Ops.size());
+  InFlag = CLI.Chain.getValue(1);
+
+  // Create the CALLSEQ_END node
+  CLI.Chain = CLI.DAG.getCALLSEQ_END(
+      CLI.Chain,
+      CLI.DAG.getIntPtrConstant(NumBytes, true),
+      CLI.DAG.getIntPtrConstant(0, true),
+      InFlag);
+  InFlag = CLI.Chain.getValue(1);
+  // Handle result values, copying them out of physregs into vregs that
+  // we return
+  return LowerCallResult(CLI.Chain, InFlag, CLI.CallConv, CLI.IsVarArg, CLI.Ins, CLI.DL, CLI.DAG,
+      InVals);
+}
+
+SDValue
+AMDILTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::i64) {
+    DST = LowerSDIV64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i32) {
+    DST = LowerSDIV32(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i16
+      || OVT.getScalarType() == MVT::i8) {
+    DST = LowerSDIV24(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::i64) {
+    DST = LowerSREM64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i32) {
+    DST = LowerSREM32(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i16) {
+    DST = LowerSREM16(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i8) {
+    DST = LowerSREM8(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerBUILD_VECTOR( SDValue Op, SelectionDAG &DAG ) const
+{
+  EVT VT = Op.getValueType();
+  SDValue Nodes1;
+  SDValue second;
+  SDValue third;
+  SDValue fourth;
+  DebugLoc DL = Op.getDebugLoc();
+  Nodes1 = DAG.getNode(AMDILISD::VBUILD,
+      DL,
+      VT, Op.getOperand(0));
+#if 0
+  bool allEqual = true;
+  for (unsigned x = 1, y = Op.getNumOperands(); x < y; ++x) {
+    if (Op.getOperand(0) != Op.getOperand(x)) {
+      allEqual = false;
+      break;
+    }
+  }
+  if (allEqual) {
+    return Nodes1;
+  }
+#endif
+  switch(Op.getNumOperands()) {
+    default:
+    case 1:
+      break;
+    case 4:
+      fourth = Op.getOperand(3);
+      if (fourth.getOpcode() != ISD::UNDEF) {
+        Nodes1 = DAG.getNode(
+            ISD::INSERT_VECTOR_ELT,
+            DL,
+            Op.getValueType(),
+            Nodes1,
+            fourth,
+            DAG.getConstant(7, MVT::i32));
+      }
+    case 3:
+      third = Op.getOperand(2);
+      if (third.getOpcode() != ISD::UNDEF) {
+        Nodes1 = DAG.getNode(
+            ISD::INSERT_VECTOR_ELT,
+            DL,
+            Op.getValueType(),
+            Nodes1,
+            third,
+            DAG.getConstant(6, MVT::i32));
+      }
+    case 2:
+      second = Op.getOperand(1);
+      if (second.getOpcode() != ISD::UNDEF) {
+        Nodes1 = DAG.getNode(
+            ISD::INSERT_VECTOR_ELT,
+            DL,
+            Op.getValueType(),
+            Nodes1,
+            second,
+            DAG.getConstant(5, MVT::i32));
+      }
+      break;
+  };
+  return Nodes1;
+}
+
+SDValue
+AMDILTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  DebugLoc DL = Op.getDebugLoc();
+  Cond = getConversionNode(DAG, Cond, Op, true);
+  Cond = DAG.getNode(AMDILISD::CMOVLOG,
+      DL,
+      Op.getValueType(), Cond, LHS, RHS);
+  return Cond;
+}
+SDValue
+AMDILTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Cond;
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue CC  = Op.getOperand(2);
+  DebugLoc DL = Op.getDebugLoc();
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  unsigned int AMDILCC = CondCCodeToCC(
+      SetCCOpcode,
+      LHS.getValueType().getSimpleVT().SimpleTy);
+  assert((AMDILCC != AMDILCC::COND_ERROR) && "Invalid SetCC!");
+  Cond = DAG.getNode(
+      ISD::SELECT_CC,
+      Op.getDebugLoc(),
+      LHS.getValueType(),
+      LHS, RHS,
+      DAG.getConstant(-1, MVT::i32),
+      DAG.getConstant(0, MVT::i32),
+      CC);
+  Cond = getConversionNode(DAG, Cond, Op, true);
+  Cond = DAG.getNode(
+      ISD::AND,
+      DL,
+      Cond.getValueType(),
+      DAG.getConstant(1, Cond.getValueType()),
+      Cond);
+  return Cond;
+}
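+// For example, (setcc i32 %a, %b, setlt) becomes a select_cc producing -1
+// or 0; the conversion node plus (and 1, cc) then yields the 0/1 value the
+// users of SETCC expect.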
+
+SDValue
+AMDILTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Data = Op.getOperand(0);
+  VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
+  DebugLoc DL = Op.getDebugLoc();
+  EVT DVT = Data.getValueType();
+  EVT BVT = BaseType->getVT();
+  unsigned baseBits = BVT.getScalarType().getSizeInBits();
+  unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
+  unsigned shiftBits = srcBits - baseBits;
+  if (srcBits < 32) {
+    // If the op is less than 32 bits, it needs to be extended to 32 bits
+    // so it can properly keep the upper bits valid.
+    EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
+    Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
+    shiftBits = 32 - baseBits;
+    DVT = IVT;
+  }
+  SDValue Shift = DAG.getConstant(shiftBits, DVT);
+  // Shift left by 'Shift' bits.
+  Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
+  // Signed shift Right by 'Shift' bits.
+  Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
+  if (srcBits < 32) {
+    // Once the sign extension is done, the op needs to be converted to
+    // its original type.
+    Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
+  }
+  return Data;
+}
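+// Worked example: sign-extending an i8 value held in an i32 uses
+// shiftBits = 32 - 8 = 24, so 0x000000B4 << 24 = 0xB4000000 and an
+// arithmetic shift right by 24 restores 0xFFFFFFB4, i.e. -76.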
+EVT
+AMDILTargetLowering::genIntType(uint32_t size, uint32_t numEle) const
+{
+  int iSize = (size * numEle);
+  int vEle = (iSize >> ((size == 64) ? 6 : 5));
+  if (!vEle) {
+    vEle = 1;
+  }
+  if (size == 64) {
+    if (vEle == 1) {
+      return EVT(MVT::i64);
+    } else {
+      return EVT(MVT::getVectorVT(MVT::i64, vEle));
+    }
+  } else {
+    if (vEle == 1) {
+      return EVT(MVT::i32);
+    } else {
+      return EVT(MVT::getVectorVT(MVT::i32, vEle));
+    }
+  }
+}
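+// For illustration: genIntType(32, 1) yields i32, genIntType(32, 4) yields
+// v4i32, and genIntType(64, 2) yields v2i64.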
+
+SDValue
+AMDILTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  unsigned int SPReg = AMDGPU::SP;
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue SP = DAG.getCopyFromReg(Chain,
+      DL,
+      SPReg, MVT::i32);
+  SDValue NewSP = DAG.getNode(ISD::ADD,
+      DL,
+      MVT::i32, SP, Size);
+  Chain = DAG.getCopyToReg(SP.getValue(1),
+      DL,
+      SPReg, NewSP);
+  SDValue Ops[2] = {NewSP, Chain};
+  Chain = DAG.getMergeValues(Ops, 2, DL);
+  return Chain;
+}
+SDValue
+AMDILTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond  = Op.getOperand(1);
+  SDValue Jump  = Op.getOperand(2);
+  SDValue Result;
+  Result = DAG.getNode(
+      AMDILISD::BRANCH_COND,
+      Op.getDebugLoc(),
+      Op.getValueType(),
+      Chain, Jump, Cond);
+  return Result;
+}
+
+SDValue
+AMDILTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue CC = Op.getOperand(1);
+  SDValue LHS   = Op.getOperand(2);
+  SDValue RHS   = Op.getOperand(3);
+  SDValue JumpT  = Op.getOperand(4);
+  SDValue CmpValue;
+  SDValue Result;
+  CmpValue = DAG.getNode(
+      ISD::SELECT_CC,
+      Op.getDebugLoc(),
+      LHS.getValueType(),
+      LHS, RHS,
+      DAG.getConstant(-1, MVT::i32),
+      DAG.getConstant(0, MVT::i32),
+      CC);
+  Result = DAG.getNode(
+      AMDILISD::BRANCH_COND,
+      CmpValue.getDebugLoc(),
+      MVT::Other, Chain,
+      JumpT, CmpValue);
+  return Result;
+}
+
+// LowerRET - Lower an ISD::RET node.
+SDValue
+AMDILTargetLowering::LowerReturn(SDValue Chain,
+    CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    DebugLoc dl, SelectionDAG &DAG)
+const
+{
+  //MachineFunction& MF = DAG.getMachineFunction();
+  // CCValAssign - represent the assignment of the return value
+  // to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  // Analyze return values of ISD::RET
+  CCInfo.AnalyzeReturn(Outs, RetCC_AMDIL32);
+  // If this is the first return lowered for this function, add
+  // the regs to the liveout set for the function
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  for (unsigned int i = 0, e = RVLocs.size(); i != e; ++i) {
+    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) {
+      MRI.addLiveOut(RVLocs[i].getLocReg());
+    }
+  }
+  // FIXME: implement this when tail call is implemented
+  // Chain = GetPossiblePreceedingTailCall(Chain, AMDILISD::TAILCALL);
+  // both x86 and ppc implement this in ISelLowering
+
+  // Regular return here
+  SDValue Flag;
+  SmallVector<SDValue, 6> RetOps;
+  RetOps.push_back(Chain);
+  RetOps.push_back(DAG.getConstant(0/*getBytesToPopOnReturn()*/, MVT::i32));
+  for (unsigned int i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    SDValue ValToCopy = OutVals[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    // ISD::Ret => ret chain, (regnum1, val1), ...
+    // So i * 2 + 1 indexes only the regnums
+    Chain = DAG.getCopyToReg(Chain,
+        dl,
+        VA.getLocReg(),
+        ValToCopy,
+        Flag);
+    // Guarantee that all emitted copies are glued together
+    // so the scheduler cannot break them apart
+    Flag = Chain.getValue(1);
+  }
+  /*if (MF.getFunction()->hasStructRetAttr()) {
+    assert(0 && "Struct returns are not yet implemented!");
+  // Both MIPS and X86 have this
+  }*/
+  RetOps[0] = Chain;
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  Flag = DAG.getNode(AMDILISD::RET_FLAG,
+      dl,
+      MVT::Other, &RetOps[0], RetOps.size());
+  return Flag;
+}
+
+unsigned int
+AMDILTargetLowering::getFunctionAlignment(const Function *) const
+{
+  return 0;
+}
+
+SDValue
+AMDILTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  MVT INTTY;
+  MVT FLTTY;
+  if (!OVT.isVector()) {
+    INTTY = MVT::i32;
+    FLTTY = MVT::f32;
+  } else if (OVT.getVectorNumElements() == 2) {
+    INTTY = MVT::v2i32;
+    FLTTY = MVT::v2f32;
+  } else if (OVT.getVectorNumElements() == 4) {
+    INTTY = MVT::v4i32;
+    FLTTY = MVT::v4f32;
+  }
+  unsigned bitsize = OVT.getScalarType().getSizeInBits();
+  // char|short jq = ia ^ ib;
+  SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
+
+  // jq = jq >> (bitsize - 2)
+  jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); 
+
+  // jq = jq | 0x1
+  jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
+
+  // jq = (int)jq
+  jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
+
+  // int ia = (int)LHS;
+  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
+
+  // int ib = (int)RHS;
+  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
+
+  // float fa = (float)ia;
+  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
+
+  // float fb = (float)ib;
+  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
+
+  // float fq = native_divide(fa, fb);
+  SDValue fq = DAG.getNode(AMDILISD::DIV_INF, DL, FLTTY, fa, fb);
+
+  // fq = trunc(fq);
+  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
+
+  // float fqneg = -fq;
+  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
+
+  // float fr = mad(fqneg, fb, fa);
+  SDValue fr = DAG.getNode(AMDILISD::MAD, DL, FLTTY, fqneg, fb, fa);
+
+  // int iq = (int)fq;
+  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
+
+  // fr = fabs(fr);
+  fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
+
+  // fb = fabs(fb);
+  fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
+
+  // int cv = fr >= fb;
+  SDValue cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
+  // jq = (cv ? jq : 0);
+  jq = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, cv, jq, 
+      DAG.getConstant(0, OVT));
+  // dst = iq + jq;
+  iq = DAG.getSExtOrTrunc(iq, DL, OVT);
+  iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
+  return iq;
+}
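+// Worked example for i8: with LHS = -7, RHS = 2 we get
+// jq = ((-7 ^ 2) >> 6) | 1 = -1, fq = trunc(-7.0 / 2.0) = -3.0 and
+// fr = mad(3.0, 2.0, -7.0) = -1.0; since |fr| < |fb| the +/-1 correction
+// jq is dropped and the result is iq = -3.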
+
+SDValue
+AMDILTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerSDIV32 function generates code equivalent to the following IL.
+  // mov r0, LHS
+  // mov r1, RHS
+  // ilt r10, r0, 0
+  // ilt r11, r1, 0
+  // iadd r0, r0, r10
+  // iadd r1, r1, r11
+  // ixor r0, r0, r10
+  // ixor r1, r1, r11
+  // udiv r0, r0, r1
+  // ixor r10, r10, r11
+  // iadd r0, r0, r10
+  // ixor DST, r0, r10
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // ilt r10, r0, 0
+  SDValue r10 = DAG.getSelectCC(DL,
+      r0, DAG.getConstant(0, OVT),
+      DAG.getConstant(-1, MVT::i32),
+      DAG.getConstant(0, MVT::i32),
+      ISD::SETLT);
+
+  // ilt r11, r1, 0
+  SDValue r11 = DAG.getSelectCC(DL,
+      r1, DAG.getConstant(0, OVT),
+      DAG.getConstant(-1, MVT::i32),
+      DAG.getConstant(0, MVT::i32),
+      ISD::SETLT);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // iadd r1, r1, r11
+  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+  // ixor r0, r0, r10
+  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+  // ixor r1, r1, r11
+  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+  // udiv r0, r0, r1
+  r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+  // ixor r10, r10, r11
+  r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // ixor DST, r0, r10
+  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); 
+  return DST;
+}
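+// Worked example: for LHS = -7, RHS = 2 the sign masks are r10 = -1 and
+// r11 = 0; (x + s) ^ s takes absolute values, giving 7 udiv 2 = 3; the
+// combined sign r10 ^ r11 = -1 then turns 3 back into -3.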
+
+SDValue
+AMDILTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const
+{
+  return SDValue(Op.getNode(), 0);
+}
+
+SDValue
+AMDILTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2i8) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4i8) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
+  SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
+  LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
+  LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
+  return LHS;
+}
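+// LowerSREM8 and LowerSREM16 share one pattern: sign-extend the narrow
+// (possibly vector) operands to 32-bit integers, run the i32 SREM path,
+// and truncate back, e.g. v4i8 -> v4i32 -> SREM -> v4i8.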
+
+SDValue
+AMDILTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2i16) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4i16) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
+  SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
+  LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
+  LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
+  return LHS;
+}
+
+SDValue
+AMDILTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerSREM32 function generates code equivalent to the following IL.
+  // mov r0, LHS
+  // mov r1, RHS
+  // ilt r10, r0, 0
+  // ilt r11, r1, 0
+  // iadd r0, r0, r10
+  // iadd r1, r1, r11
+  // ixor r0, r0, r10
+  // ixor r1, r1, r11
+  // udiv r20, r0, r1
+  // umul r20, r20, r1
+  // sub r0, r0, r20
+  // iadd r0, r0, r10
+  // ixor DST, r0, r10
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // ilt r10, r0, 0
+  SDValue r10 = DAG.getNode(AMDILISD::CMP, DL, OVT,
+      DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32),
+      r0, DAG.getConstant(0, OVT));
+
+  // ilt r11, r1, 0
+  SDValue r11 = DAG.getNode(AMDILISD::CMP, DL, OVT, 
+      DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32),
+      r1, DAG.getConstant(0, OVT));
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // iadd r1, r1, r11
+  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+  // ixor r0, r0, r10
+  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+  // ixor r1, r1, r11
+  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+  // udiv r20, r0, r1
+  SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+  // umul r20, r20, r1
+  r20 = DAG.getNode(AMDILISD::UMUL, DL, OVT, r20, r1);
+
+  // sub r0, r0, r20
+  r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // ixor DST, r0, r10
+  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); 
+  return DST;
+}
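+// Worked example: for LHS = -7, RHS = 2 the absolute values are 7 and 2;
+// 7 udiv 2 = 3, 3 * 2 = 6 and 7 - 6 = 1; re-applying the dividend sign via
+// (1 + -1) ^ -1 yields -1, matching C's -7 % 2.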
+
+SDValue
+AMDILTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const
+{
+  return SDValue(Op.getNode(), 0);
+}
diff --git a/lib/Target/AMDGPU/AMDILISelLowering.h b/lib/Target/AMDGPU/AMDILISelLowering.h
new file mode 100644 (file)
index 0000000..817aaf5
--- /dev/null
@@ -0,0 +1,203 @@
+//===-- AMDILISelLowering.h - AMDIL DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AMDIL uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDIL_ISELLOWERING_H_
+#define AMDIL_ISELLOWERING_H_
+#include "AMDIL.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm
+{
+  namespace AMDILISD
+  {
+    enum
+    {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+      CMOVLOG,     // 32-bit FP conditional move logical instruction
+      MAD,         // 32-bit fused multiply-add instruction
+      VBUILD,      // scalar to vector mov instruction
+      CALL,        // Function call based on a single integer
+      SELECT_CC,   // Select the correct conditional instruction
+      UMUL,        // 32-bit unsigned multiplication
+      DIV_INF,     // Divide with infinity returned on zero divisor
+      CMP,
+      IL_CC_I_GT,
+      IL_CC_I_LT,
+      IL_CC_I_GE,
+      IL_CC_I_LE,
+      IL_CC_I_EQ,
+      IL_CC_I_NE,
+      RET_FLAG,
+      BRANCH_COND,
+      LAST_ISD_NUMBER
+    };
+  } // AMDILISD
+
+  class MachineBasicBlock;
+  class MachineInstr;
+  class DebugLoc;
+  class TargetInstrInfo;
+
+  class AMDILTargetLowering : public TargetLowering
+  {
+    public:
+      AMDILTargetLowering(TargetMachine &TM);
+
+      virtual SDValue
+        LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+      /// computeMaskedBitsForTargetNode - Determine which of the bits
+      /// specified in Mask are known to be either zero or one and return
+      /// them in the KnownZero/KnownOne bitsets.
+      virtual void
+        computeMaskedBitsForTargetNode(
+            const SDValue Op,
+            APInt &KnownZero,
+            APInt &KnownOne,
+            const SelectionDAG &DAG,
+            unsigned Depth = 0
+            ) const;
+
+      virtual bool 
+        getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                  const CallInst &I, unsigned Intrinsic) const;
+      virtual const char*
+        getTargetNodeName(
+            unsigned Opcode
+            ) const;
+      // We want to mark f32/f64 floating point values as
+      // legal
+      bool
+        isFPImmLegal(const APFloat &Imm, EVT VT) const;
+      // We don't want to shrink f64/f32 constants because
+      // they both take up the same amount of space and
+      // we don't want to use a f2d instruction.
+      bool ShouldShrinkFPConstant(EVT VT) const;
+
+      /// getFunctionAlignment - Return the Log2 alignment of this
+      /// function.
+      virtual unsigned int
+        getFunctionAlignment(const Function *F) const;
+
+    private:
+      CCAssignFn*
+        CCAssignFnForNode(unsigned int CC) const;
+
+      SDValue LowerCallResult(SDValue Chain,
+          SDValue InFlag,
+          CallingConv::ID CallConv,
+          bool isVarArg,
+          const SmallVectorImpl<ISD::InputArg> &Ins,
+          DebugLoc dl,
+          SelectionDAG &DAG,
+          SmallVectorImpl<SDValue> &InVals) const;
+
+      SDValue LowerMemArgument(SDValue Chain,
+          CallingConv::ID CallConv,
+          const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+          DebugLoc dl, SelectionDAG &DAG,
+          const CCValAssign &VA,  MachineFrameInfo *MFI,
+          unsigned i) const;
+
+      SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+          SDValue Arg,
+          DebugLoc dl, SelectionDAG &DAG,
+          const CCValAssign &VA,
+          ISD::ArgFlagsTy Flags) const;
+
+      virtual SDValue
+        LowerFormalArguments(SDValue Chain,
+            CallingConv::ID CallConv, bool isVarArg,
+            const SmallVectorImpl<ISD::InputArg> &Ins,
+            DebugLoc dl, SelectionDAG &DAG,
+            SmallVectorImpl<SDValue> &InVals) const;
+
+      virtual SDValue
+        LowerCall(CallLoweringInfo &CLI,
+        SmallVectorImpl<SDValue> &InVals) const;
+
+      virtual SDValue
+        LowerReturn(SDValue Chain,
+            CallingConv::ID CallConv, bool isVarArg,
+            const SmallVectorImpl<ISD::OutputArg> &Outs,
+            const SmallVectorImpl<SDValue> &OutVals,
+            DebugLoc dl, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSREM(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSREM8(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSREM16(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSDI