X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86InstrInfo.cpp;h=cc38c393b13ed48427b50cfefda009f64a096fd8;hb=c848b1bbcf88ab5d8318d990612fb1fda206ea3d;hp=6dcac891f09a5bb7dd97517fcce24b19f6aea4f5;hpb=510fb362a815499dde40cfae807b2ab927527ab0;p=oota-llvm.git diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 6dcac891f09..cc38c393b13 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCAsmInfo.h" @@ -35,11 +36,13 @@ #include "llvm/Target/TargetOptions.h" #include -#define GET_INSTRINFO_CTOR -#include "X86GenInstrInfo.inc" - using namespace llvm; +#define DEBUG_TYPE "x86-instr-info" + +#define GET_INSTRINFO_CTOR_DTOR +#include "X86GenInstrInfo.inc" + static cl::opt NoFusing("disable-spill-fusing", cl::desc("Disable fusing of spill code into instructions")); @@ -91,6 +94,9 @@ struct X86OpTblEntry { uint16_t Flags; }; +// Pin the vtable to this file. +void X86InstrInfo::anchor() {} + X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) : X86GenInstrInfo((tm.getSubtarget().is64Bit() ? X86::ADJCALLSTACKDOWN64 @@ -441,7 +447,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 }, { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 }, { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, @@ -502,7 +507,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZDI2PDIrr, X86::VMOVZDI2PDIrm, 0 }, { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, @@ -603,6 +607,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMOVDQA64rr, X86::VMOVDQA64rm, TB_ALIGN_64 }, { X86::VMOVDQU32rr, X86::VMOVDQU32rm, 0 }, { X86::VMOVDQU64rr, X86::VMOVDQU64rm, 0 }, + { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, + { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, // AES foldable instructions { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, @@ -1208,8 +1214,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::PEXT64rr, X86::PEXT64rm, 0 }, // AVX-512 foldable instructions - { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, - { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, @@ -1222,17 +1226,31 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, + { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, + { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, + { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 }, + { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 }, + { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 }, + { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 }, + { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 }, + { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 }, + { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 }, + { 
X86::VPMINUQZrr, X86::VPMINUQZrm, 0 }, + { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 }, { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 }, { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, + { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 }, + { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 }, { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, { X86::VALIGNQrri, X86::VALIGNQrmi, 0 }, { X86::VALIGNDrri, X86::VALIGNDrmi, 0 }, + { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, // AES foldable instructions { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, @@ -1266,119 +1284,111 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) static const X86OpTblEntry OpTbl3[] = { // FMA foldable instructions - { X86::VFMADDSSr231r, X86::VFMADDSSr231m, 0 }, - { X86::VFMADDSDr231r, X86::VFMADDSDr231m, 0 }, - { X86::VFMADDSSr132r, X86::VFMADDSSr132m, 0 }, - { X86::VFMADDSDr132r, X86::VFMADDSDr132m, 0 }, - { X86::VFMADDSSr213r, X86::VFMADDSSr213m, 0 }, - { X86::VFMADDSDr213r, X86::VFMADDSDr213m, 0 }, - { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, 0 }, - { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, 0 }, - - { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_16 }, - { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_16 }, - { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_16 }, - { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_16 }, - { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_16 }, - { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_16 }, - { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_32 }, - { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_32 }, - { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_32 }, - { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 }, - { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 }, - { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 }, - - { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 }, - { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, 0 }, - { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, 0 }, - { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, 0 }, - { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, 0 }, - { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, 0 }, - { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, 0 }, - { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, 0 }, - - { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_16 }, - { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_16 }, - { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_16 }, - { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_16 }, - { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_16 }, - { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_16 }, - { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_32 }, - { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_32 }, - { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_32 }, - { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 }, - { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 }, - { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 }, - - { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 }, - { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 }, - { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, 0 }, - { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, 0 }, - { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, 0 }, - { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, 0 }, - { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, 0 }, - { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, 0 }, - - { X86::VFMSUBPSr231r, 
X86::VFMSUBPSr231m, TB_ALIGN_16 }, - { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_16 }, - { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_16 }, - { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_16 }, - { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_16 }, - { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_16 }, - { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_32 }, - { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_32 }, - { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_32 }, - { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 }, - { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 }, - { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 }, - - { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 }, - { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 }, - { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, 0 }, - { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, 0 }, - { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, 0 }, - { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, 0 }, - { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, 0 }, - { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, 0 }, - - { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_16 }, - { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_16 }, - { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_16 }, - { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_16 }, - { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_16 }, - { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_16 }, - { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_32 }, - { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_32 }, - { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_32 }, - { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_32 }, - { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 }, - { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 }, - - { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 }, - { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 }, - { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_16 }, - { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_16 }, - { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_16 }, - { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_16 }, - { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_32 }, - { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_32 }, - { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_32 }, - { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 }, - { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 }, - { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 }, - - { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 }, - { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 }, - { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_16 }, - { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_16 }, - { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_16 }, - { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_16 }, - { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_32 }, - { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_32 }, - { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_32 }, - { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 }, - { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 }, - { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 }, + { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r, 
X86::VFMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, + + { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, + { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, + { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE }, + { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE }, + { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE }, + { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE }, + { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE }, + { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_NONE }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE }, + { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE }, + { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, + + { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, + + { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE }, + { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE }, + { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE }, + { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, + + { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, + + { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE }, + { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, + + { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, + { 
X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, + + { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE }, + + { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE }, + + { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE }, // FMA4 foldable patterns { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 }, @@ -1418,6 +1428,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, + { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 }, + { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 }, + { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 }, + { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 } }; for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { @@ -1534,8 +1548,8 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::VMOVDQAYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: - case X86::VMOVDQA32rm: - case X86::VMOVDQA64rm: + case X86::VMOVAPSZrm: + case X86::VMOVUPSZrm: return true; } } @@ -1561,6 +1575,8 @@ static bool isFrameStoreOpcode(int Opcode) { case X86::VMOVAPSYmr: case X86::VMOVAPDYmr: case X86::VMOVDQAYmr: + case X86::VMOVUPSZmr: + case X86::VMOVAPSZmr: case X86::MMX_MOVD64mr: case X86::MMX_MOVQ64mr: case X86::MMX_MOVNTQmr: 
@@ -1619,9 +1635,9 @@ static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { if (!TargetRegisterInfo::isVirtualRegister(BaseReg)) return false; bool isPICBase = false; - for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg), - E = MRI.def_end(); I != E; ++I) { - MachineInstr *DefMI = I.getOperand().getParent(); + for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg), + E = MRI.def_instr_end(); I != E; ++I) { + MachineInstr *DefMI = &*I; if (DefMI->getOpcode() != X86::MOVPC32r) return false; assert(!isPICBase && "More than one PIC base?"); @@ -1807,7 +1823,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MBB.insert(I, MI); } - MachineInstr *NewMI = prior(I); + MachineInstr *NewMI = std::prev(I); NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); } @@ -1984,7 +2000,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, unsigned Src2 = MI->getOperand(2).getReg(); bool isKill2 = MI->getOperand(2).isKill(); unsigned leaInReg2 = 0; - MachineInstr *InsMI2 = 0; + MachineInstr *InsMI2 = nullptr; if (Src == Src2) { // ADD16rr %reg1028, %reg1028 // just a single insert_subreg. @@ -2048,14 +2064,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // convert them to equivalent lea if the condition code register def's // are dead! if (hasLiveCondCodeDef(MI)) - return 0; + return nullptr; MachineFunction &MF = *MI->getParent()->getParent(); // All instructions input are two-addr instructions. Get the known operands. const MachineOperand &Dest = MI->getOperand(0); const MachineOperand &Src = MI->getOperand(1); - MachineInstr *NewMI = NULL; + MachineInstr *NewMI = nullptr; // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When // we have better subtarget support, enable the 16-bit LEA generation here. // 16-bit LEA is also slow on Core2. @@ -2066,11 +2082,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, switch (MIOpc) { case X86::SHUFPSrri: { assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!"); - if (!TM.getSubtarget().hasSSE2()) return 0; + if (!TM.getSubtarget().hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); - if (B != C) return 0; + if (B != C) return nullptr; unsigned M = MI->getOperand(3).getImm(); NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) .addOperand(Dest).addOperand(Src).addImm(M); @@ -2078,11 +2094,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::SHUFPDrri: { assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!"); - if (!TM.getSubtarget().hasSSE2()) return 0; + if (!TM.getSubtarget().hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); - if (B != C) return 0; + if (B != C) return nullptr; unsigned M = MI->getOperand(3).getImm(); // Convert to PSHUFD mask. @@ -2095,13 +2111,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHL64ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; // LEA can't handle RSP. 
if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && !MF.getRegInfo().constrainRegClass(Src.getReg(), &X86::GR64_NOSPRegClass)) - return 0; + return nullptr; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) .addOperand(Dest) @@ -2111,7 +2127,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHL32ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; @@ -2121,7 +2137,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2137,10 +2153,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHL16ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest) .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); @@ -2149,7 +2165,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, default: { switch (MIOpc) { - default: return 0; + default: return nullptr; case X86::INC64r: case X86::INC32r: case X86::INC64_32r: { @@ -2161,7 +2177,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2175,7 +2191,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::INC16r: case X86::INC64_16r: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest).addOperand(Src), 1); @@ -2192,7 +2209,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2207,7 +2224,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::DEC16r: case X86::DEC64_16r: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest).addOperand(Src), -1); @@ -2228,7 +2246,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; const MachineOperand &Src2 = MI->getOperand(2); bool isKill2, isUndef2; @@ -2236,7 +2254,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, SrcReg2, isKill2, isUndef2, ImplicitOp2)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest); @@ -2258,7 +2276,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD16rr: case X86::ADD16rr_DB: { if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Src2 = MI->getOperand(2).getReg(); bool isKill2 = MI->getOperand(2).isKill(); @@ -2297,7 +2316,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2313,7 +2332,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD16ri_DB: case X86::ADD16ri8_DB: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest).addOperand(Src), @@ -2323,7 +2343,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } } - if (!NewMI) return 0; + if (!NewMI) return nullptr; if (LV) { // Update live variables if (Src.isKill()) @@ -2450,6 +2470,41 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { } } +bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + switch (MI->getOpcode()) { + case X86::VFMADDPDr231r: + case X86::VFMADDPSr231r: + case X86::VFMADDSDr231r: + case X86::VFMADDSSr231r: + case X86::VFMSUBPDr231r: + case X86::VFMSUBPSr231r: + case X86::VFMSUBSDr231r: + case X86::VFMSUBSSr231r: + case X86::VFNMADDPDr231r: + case X86::VFNMADDPSr231r: + case X86::VFNMADDSDr231r: + case X86::VFNMADDSSr231r: + case X86::VFNMSUBPDr231r: + case X86::VFNMSUBPSr231r: + case X86::VFNMSUBSDr231r: + case X86::VFNMSUBSSr231r: + case X86::VFMADDPDr231rY: + case X86::VFMADDPSr231rY: + case X86::VFMSUBPDr231rY: + case X86::VFMSUBPSr231rY: + case X86::VFNMADDPDr231rY: + case X86::VFNMADDPSr231rY: + case X86::VFNMSUBPDr231rY: + case X86::VFNMSUBPSr231rY: + SrcOpIdx1 = 2; + SrcOpIdx2 = 3; + return true; + default: + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + } +} + static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { switch (BrOpc) { default: return X86::COND_INVALID; @@ -2736,15 +2791,15 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } // If the block has any instructions after a JMP, delete them. - while (llvm::next(I) != MBB.end()) - llvm::next(I)->eraseFromParent(); + while (std::next(I) != MBB.end()) + std::next(I)->eraseFromParent(); Cond.clear(); - FBB = 0; + FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through. 
if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = 0; + TBB = nullptr; I->eraseFromParent(); I = MBB.end(); UnCondBrIter = MBB.end(); @@ -3013,6 +3068,11 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return 0; } +inline static bool MaskRegClassContains(unsigned Reg) { + return X86::VK8RegClass.contains(Reg) || + X86::VK16RegClass.contains(Reg) || + X86::VK1RegClass.contains(Reg); +} static unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { if (X86::VR128XRegClass.contains(DestReg, SrcReg) || @@ -3022,11 +3082,23 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { SrcReg = get512BitSuperRegister(SrcReg); return X86::VMOVAPSZrr; } - if ((X86::VK8RegClass.contains(DestReg) || - X86::VK16RegClass.contains(DestReg)) && - (X86::VK8RegClass.contains(SrcReg) || - X86::VK16RegClass.contains(SrcReg))) + if (MaskRegClassContains(DestReg) && + MaskRegClassContains(SrcReg)) return X86::KMOVWkk; + if (MaskRegClassContains(DestReg) && + (X86::GR32RegClass.contains(SrcReg) || + X86::GR16RegClass.contains(SrcReg) || + X86::GR8RegClass.contains(SrcReg))) { + SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32); + return X86::KMOVWkr; + } + if ((X86::GR32RegClass.contains(DestReg) || + X86::GR16RegClass.contains(DestReg) || + X86::GR8RegClass.contains(DestReg)) && + MaskRegClassContains(SrcReg)) { + DestReg = getX86SubSuperRegister(DestReg, MVT::i32); + return X86::KMOVWrk; + } return 0; } @@ -3114,7 +3186,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, const TargetMachine &TM, bool load) { if (TM.getSubtarget().hasAVX512()) { - if (X86::VK8RegClass.hasSubClassEq(RC) || + if (X86::VK8RegClass.hasSubClassEq(RC) || X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC)) @@ -3166,7 +3238,8 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? X86::LD_Fp80m : X86::ST_FpP80m; case 16: { - assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); + assert((X86::VR128RegClass.hasSubClassEq(RC) || + X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass"); // If stack is realigned we can use aligned stores. if (isStackAligned) return load ? @@ -3178,7 +3251,8 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); } case 32: - assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); + assert((X86::VR256RegClass.hasSubClassEq(RC) || + X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass"); // If stack is realigned we can use aligned stores. if (isStackAligned) return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr; @@ -3553,7 +3627,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // We are searching for an earlier instruction that can make CmpInstr // redundant and that instruction will be saved in Sub. - MachineInstr *Sub = NULL; + MachineInstr *Sub = nullptr; const TargetRegisterInfo *TRI = &getRegisterInfo(); // We iterate backward, starting from the instruction before CmpInstr and @@ -3566,7 +3640,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, RE = CmpInstr->getParent() == MI->getParent() ? 
MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ : CmpInstr->getParent()->rend(); - MachineInstr *Movr0Inst = 0; + MachineInstr *Movr0Inst = nullptr; for (; RI != RE; ++RI) { MachineInstr *Instr = &*RI; // Check whether CmpInstr can be made redundant by the current instruction. @@ -3741,19 +3815,19 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const { if (FoldAsLoadDefReg == 0) - return 0; + return nullptr; // To be conservative, if there exists another load, clear the load candidate. if (MI->mayLoad()) { FoldAsLoadDefReg = 0; - return 0; + return nullptr; } // Check whether we can move DefMI here. DefMI = MRI->getVRegDef(FoldAsLoadDefReg); assert(DefMI); bool SawStore = false; - if (!DefMI->isSafeToMove(this, 0, SawStore)) - return 0; + if (!DefMI->isSafeToMove(this, nullptr, SawStore)) + return nullptr; // We try to commute MI if possible. unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1; @@ -3770,12 +3844,12 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, continue; // Do not fold if we have a subreg use or a def or multiple uses. if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) - return 0; + return nullptr; SrcOperandId = i; FoundSrcOperand = true; } - if (!FoundSrcOperand) return 0; + if (!FoundSrcOperand) return nullptr; // Check whether we can fold the def into SrcOperandId. SmallVector Ops; @@ -3789,22 +3863,22 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, if (Idx == 1) { // MI was changed but it didn't help, commute it back! commuteInstruction(MI, false); - return 0; + return nullptr; } // Check whether we can commute MI and enable folding. if (MI->isCommutable()) { MachineInstr *NewMI = commuteInstruction(MI, false); // Unable to commute. - if (!NewMI) return 0; + if (!NewMI) return nullptr; if (NewMI != MI) { // New instruction. It doesn't need to be kept. NewMI->eraseFromParent(); - return 0; + return nullptr; } } } - return 0; + return nullptr; } /// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr @@ -3833,6 +3907,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { bool HasAVX = TM.getSubtarget().hasAVX(); MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); switch (MI->getOpcode()) { + case X86::MOV32r0: + return Expand2AddrUndef(MIB, get(X86::XOR32rr)); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: @@ -3857,6 +3933,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; + case X86::KSET0B: case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); case X86::KSET1B: case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); @@ -3936,7 +4013,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned i, const SmallVectorImpl &MOs, unsigned Size, unsigned Align) const { - const DenseMap > *OpcodeTablePtr = 0; + const DenseMap > *OpcodeTablePtr = nullptr; bool isCallRegIndirect = TM.getSubtarget().callRegIndirect(); bool isTwoAddrFold = false; @@ -3944,7 +4022,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // when X86Subtarget is Atom. 
if (isCallRegIndirect && (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) { - return NULL; + return nullptr; } unsigned NumOps = MI->getDesc().getNumOperands(); @@ -3955,9 +4033,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. if (MI->getOpcode() == X86::ADD32ri && MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) - return NULL; + return nullptr; - MachineInstr *NewMI = NULL; + MachineInstr *NewMI = nullptr; // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. @@ -3992,7 +4070,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, unsigned Opcode = I->second.first; unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; if (Align < MinAlign) - return NULL; + return nullptr; bool NarrowToMOV32rm = false; if (Size) { unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize(); @@ -4000,12 +4078,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) - return NULL; + return nullptr; // If this is a 64-bit load, but the spill slot is 32, then we can do // a 32-bit load which is implicitly zero-extended. This likely is due // to liveintervalanalysis remat'ing a load from stack slot. if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) - return NULL; + return nullptr; Opcode = X86::MOV32rm; NarrowToMOV32rm = true; } @@ -4034,7 +4112,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // No fusion if (PrintFailedFusing && !MI->isCopy()) dbgs() << "We failed to fuse operand " << i << " in " << *MI; - return NULL; + return nullptr; } /// hasPartialRegUpdate - Return true for all instructions that only update @@ -4073,20 +4151,6 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::RSQRTSSr_Int: case X86::SQRTSSr: case X86::SQRTSSr_Int: - // AVX encoded versions - case X86::VCVTSD2SSrr: - case X86::Int_VCVTSD2SSrr: - case X86::VCVTSS2SDrr: - case X86::Int_VCVTSS2SDrr: - case X86::VCVTSD2SSZrr: - case X86::VCVTSS2SDZrr: - case X86::VRCPSSr: - case X86::VROUNDSDr: - case X86::VROUNDSDr_Int: - case X86::VROUNDSSr: - case X86::VROUNDSSr_Int: - case X86::VRSQRTSSr: - case X86::VSQRTSSr: return true; } @@ -4118,10 +4182,77 @@ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, return 16; } +// Return true for any instruction the copies the high bits of the first source +// operand into the unused high bits of the destination operand. 
+static bool hasUndefRegUpdate(unsigned Opcode) { + switch (Opcode) { + case X86::VCVTSI2SSrr: + case X86::Int_VCVTSI2SSrr: + case X86::VCVTSI2SS64rr: + case X86::Int_VCVTSI2SS64rr: + case X86::VCVTSI2SDrr: + case X86::Int_VCVTSI2SDrr: + case X86::VCVTSI2SD64rr: + case X86::Int_VCVTSI2SD64rr: + case X86::VCVTSD2SSrr: + case X86::Int_VCVTSD2SSrr: + case X86::VCVTSS2SDrr: + case X86::Int_VCVTSS2SDrr: + case X86::VRCPSSr: + case X86::VROUNDSDr: + case X86::VROUNDSDr_Int: + case X86::VROUNDSSr: + case X86::VROUNDSSr_Int: + case X86::VRSQRTSSr: + case X86::VSQRTSSr: + + // AVX-512 + case X86::VCVTSD2SSZrr: + case X86::VCVTSS2SDZrr: + return true; + } + + return false; +} + +/// Inform the ExeDepsFix pass how many idle instructions we would like before +/// certain undef register reads. +/// +/// This catches the VCVTSI2SD family of instructions: +/// +/// vcvtsi2sdq %rax, %xmm0, %xmm14 +/// +/// We should to be careful *not* to catch VXOR idioms which are presumably +/// handled specially in the pipeline: +/// +/// vxorps %xmm1, %xmm1, %xmm1 +/// +/// Like getPartialRegUpdateClearance, this makes a strong assumption that the +/// high bits that are passed-through are not live. +unsigned X86InstrInfo:: +getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum, + const TargetRegisterInfo *TRI) const { + if (!hasUndefRegUpdate(MI->getOpcode())) + return 0; + + // Set the OpNum parameter to the first source operand. + OpNum = 1; + + const MachineOperand &MO = MI->getOperand(OpNum); + if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + // Use the same magic number as getPartialRegUpdateClearance. + return 16; + } + return 0; +} + void X86InstrInfo:: breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { unsigned Reg = MI->getOperand(OpNum).getReg(); + // If MI kills this register, the false dependence is already broken. + if (MI->killsRegister(Reg, TRI)) + return; if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. @@ -4141,19 +4272,19 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, MI->addRegisterKilled(Reg, TRI, true); } -MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, - const SmallVectorImpl &Ops, - int FrameIndex) const { +MachineInstr* +X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl &Ops, + int FrameIndex) const { // Check switch flag - if (NoFusing) return NULL; + if (NoFusing) return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls if (!MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) - return 0; + return nullptr; const MachineFrameInfo *MFI = MF.getFrameInfo(); unsigned Size = MFI->getObjectSize(FrameIndex); @@ -4166,7 +4297,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, unsigned NewOpc = 0; unsigned RCSize = 0; switch (MI->getOpcode()) { - default: return NULL; + default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break; case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break; @@ -4175,12 +4306,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Check if it's safe to fold the load. 
If the size of the object is // narrower than the load width, then it's not. if (Size < RCSize) - return NULL; + return nullptr; // Change to CMPXXri r, 0 first. MI->setDesc(get(NewOpc)); MI->getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) - return NULL; + return nullptr; SmallVector MOs; MOs.push_back(MachineOperand::CreateFI(FrameIndex)); @@ -4191,15 +4322,21 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl &Ops, MachineInstr *LoadMI) const { + // If loading from a FrameIndex, fold directly from the FrameIndex. + unsigned NumOps = LoadMI->getDesc().getNumOperands(); + int FrameIndex; + if (isLoadFromStackSlot(LoadMI, FrameIndex)) + return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex); + // Check switch flag - if (NoFusing) return NULL; + if (NoFusing) return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls if (!MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) - return 0; + return nullptr; // Determine the alignment of the load. unsigned Alignment = 0; @@ -4222,12 +4359,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = 4; break; default: - return 0; + return nullptr; } if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; switch (MI->getOpcode()) { - default: return NULL; + default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; break; case X86::TEST32rr: NewOpc = X86::CMP32ri8; break; @@ -4237,12 +4374,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MI->setDesc(get(NewOpc)); MI->getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) - return NULL; + return nullptr; // Make sure the subregisters match. // Otherwise we risk changing the size of the load. if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg()) - return NULL; + return nullptr; SmallVector MOs; switch (LoadMI->getOpcode()) { @@ -4258,7 +4395,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Medium and large mode can't fold loads this way. if (TM.getCodeModel() != CodeModel::Small && TM.getCodeModel() != CodeModel::Kernel) - return NULL; + return nullptr; // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; @@ -4270,7 +4407,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // This doesn't work for several reasons. // 1. GlobalBaseReg may have been spilled. // 2. It may not be live at MI. - return NULL; + return nullptr; } // Create a constant-pool entry. @@ -4306,17 +4443,16 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, > 4) // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes). - return NULL; + return nullptr; if ((LoadMI->getOpcode() == X86::MOVSDrm || LoadMI->getOpcode() == X86::VMOVSDrm) && MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() > 8) // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes). - return NULL; + return nullptr; // Folding a normal load. Just copy the load's address operands. 
- unsigned NumOps = LoadMI->getDesc().getNumOperands(); for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) MOs.push_back(LoadMI->getOperand(i)); break; @@ -4360,7 +4496,8 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. - const DenseMap > *OpcodeTablePtr = 0; + const DenseMap > *OpcodeTablePtr = nullptr; if (isTwoAddr && NumOps >= 2 && OpNum < 2) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; } else if (OpNum == 0) { // If operand 0 @@ -4542,7 +4679,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, AddrOps.push_back(Chain); // Emit the load instruction. - SDNode *Load = 0; + SDNode *Load = nullptr; if (FoldedLoad) { EVT VT = *RC->vt_begin(); std::pair VTs; - const TargetRegisterClass *DstRC = 0; + const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { DstRC = getRegClass(MCID, 0, &RI, MF); VTs.push_back(*DstRC->vt_begin()); @@ -5045,7 +5182,13 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm }, - { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr } + { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }, + { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm}, + { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr}, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr}, + { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm}, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr}, + { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm} }; // FIXME: Some shuffle and unpack instructions have equivalents in different @@ -5055,14 +5198,14 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) if (ReplaceableInstrs[i][domain-1] == opcode) return ReplaceableInstrs[i]; - return 0; + return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) if (ReplaceableInstrsAVX2[i][domain-1] == opcode) return ReplaceableInstrsAVX2[i]; - return 0; + return nullptr; } std::pair @@ -5188,7 +5331,7 @@ namespace { static char ID; CGBR() : MachineFunctionPass(ID) {} - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { const X86TargetMachine *TM = static_cast(&MF.getTarget()); @@ -5235,11 +5378,11 @@ namespace { return true; } - virtual const char *getPassName() const { + const char *getPassName() const override { return "X86 PIC Global Base Reg Initialization"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -5255,7 +5398,7 @@ namespace { static char ID; LDTLSCleanup() : MachineFunctionPass(ID) {} - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { X86MachineFunctionInfo* MFI = MF.getInfo(); if (MFI->getNumLocalDynamicTLSAccesses() < 2) { // No point folding accesses if there isn't at least two. 
@@ -5348,11 +5491,11 @@ namespace {
       return Copy;
     }
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "Local Dynamic TLS Access Clean-up";
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.setPreservesCFG();
      AU.addRequired<MachineDominatorTree>();
      MachineFunctionPass::getAnalysisUsage(AU);