From a348c56fdee38b4d52c4e54ca9d8bea799dda345 Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard@gmail.com>
Date: Thu, 22 Mar 2007 18:42:45 +0000
Subject: [PATCH] Support added for shifts and unpacking MMX instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@35266 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IntrinsicsX86.td      |  38 ++++++++++
 lib/Target/X86/README-MMX.txt      |  59 ++++++++++++++++
 lib/Target/X86/README-SSE.txt      |  40 +++++++++++
 lib/Target/X86/X86ISelLowering.cpp |   6 +-
 lib/Target/X86/X86InstrMMX.td      | 108 ++++++++++++++++++++++++++++-
 5 files changed, 249 insertions(+), 2 deletions(-)
 create mode 100644 lib/Target/X86/README-MMX.txt

diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index 3af8f9f48c4..2984f79a5a8 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -585,3 +585,41 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v2i32_ty, llvm_v4i16_ty,
                          llvm_v4i16_ty], [IntrNoMem]>;
 }
+
+// Integer shift ops.
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  // Shift left logical
+  def int_x86_mmx_psll_w : GCCBuiltin<"__builtin_ia32_psllw">,
+              Intrinsic<[llvm_v4i16_ty, llvm_v4i16_ty,
+                         llvm_v2i32_ty], [IntrNoMem]>;
+  def int_x86_mmx_psll_d : GCCBuiltin<"__builtin_ia32_pslld">,
+              Intrinsic<[llvm_v2i32_ty, llvm_v2i32_ty,
+                         llvm_v2i32_ty], [IntrNoMem]>;
+  def int_x86_mmx_psll_q : GCCBuiltin<"__builtin_ia32_psllq">,
+              Intrinsic<[llvm_v2i32_ty, llvm_v2i32_ty,
+                         llvm_v2i32_ty], [IntrNoMem]>;
+
+  def int_x86_mmx_psrl_w : GCCBuiltin<"__builtin_ia32_psrlw">,
+              Intrinsic<[llvm_v4i16_ty, llvm_v4i16_ty,
+                         llvm_v2i32_ty], [IntrNoMem]>;
+  def int_x86_mmx_psrl_d : GCCBuiltin<"__builtin_ia32_psrld">,
+              Intrinsic<[llvm_v2i32_ty, llvm_v2i32_ty,
+                         llvm_v2i32_ty], [IntrNoMem]>;
+  def int_x86_mmx_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq">,
+              Intrinsic<[llvm_v2i32_ty, llvm_v2i32_ty,
+                         llvm_v2i32_ty], [IntrNoMem]>;
+
+  def int_x86_mmx_psra_w : GCCBuiltin<"__builtin_ia32_psraw">,
+              Intrinsic<[llvm_v4i16_ty, llvm_v4i16_ty,
+                         llvm_v2i32_ty], [IntrNoMem]>;
+  def int_x86_mmx_psra_d : GCCBuiltin<"__builtin_ia32_psrad">,
+              Intrinsic<[llvm_v2i32_ty, llvm_v2i32_ty,
+                         llvm_v2i32_ty], [IntrNoMem]>;
+}
+
+// Vector pack/unpack ops.
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_mmx_punpckh_dq : GCCBuiltin<"__builtin_ia32_punpckhdq">,
+              Intrinsic<[llvm_v2i32_ty, llvm_v2i32_ty,
+                         llvm_v2i32_ty], [IntrNoMem]>;
+}
diff --git a/lib/Target/X86/README-MMX.txt b/lib/Target/X86/README-MMX.txt
new file mode 100644
index 00000000000..b4886aed273
--- /dev/null
+++ b/lib/Target/X86/README-MMX.txt
@@ -0,0 +1,59 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: MMX-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+We should compile
+
+#include <mmintrin.h>
+
+extern __m64 C;
+
+void baz(__v2si *A, __v2si *B)
+{
+  *A = __builtin_ia32_psllq(*B, C);
+  _mm_empty();
+}
+
+to:
+
+.globl _baz
+_baz:
+	call L3
+"L00000000001$pb":
+L3:
+	popl %ecx
+	subl $12, %esp
+	movl 20(%esp), %eax
+	movq (%eax), %mm0
+	movl L_C$non_lazy_ptr-"L00000000001$pb"(%ecx), %eax
+	movq (%eax), %mm1
+	movl 16(%esp), %eax
+	psllq %mm1, %mm0
+	movq %mm0, (%eax)
+	emms
+	addl $12, %esp
+	ret
+
+not:
+
+_baz:
+	subl $12, %esp
+	call "L1$pb"
+"L1$pb":
+	popl %eax
+	movl L_C$non_lazy_ptr-"L1$pb"(%eax), %eax
+	movl (%eax), %ecx
+	movl %ecx, (%esp)
+	movl 4(%eax), %eax
+	movl %eax, 4(%esp)
+	movl 20(%esp), %eax
+	movq (%eax), %mm0
+	movq (%esp), %mm1
+	psllq %mm1, %mm0
+	movl 16(%esp), %eax
+	movq %mm0, (%eax)
+	emms
+	addl $12, %esp
+	ret
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 2b7f9ae4392..96e6aab2438 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -571,4 +571,44 @@ swizzle:
 	movaps %xmm0, (%eax)
 	ret
 
+//===---------------------------------------------------------------------===//
+
+We should compile this:
+
+#include <emmintrin.h>
+
+void foo(__m128i *A, __m128i *B) {
+  *A = _mm_sll_epi16 (*A, *B);
+}
+
+to:
+
+_foo:
+	subl $12, %esp
+	movl 16(%esp), %edx
+	movl 20(%esp), %eax
+	movdqa (%edx), %xmm1
+	movdqa (%eax), %xmm0
+	psllw %xmm0, %xmm1
+	movdqa %xmm1, (%edx)
+	addl $12, %esp
+	ret
+not:
+
+_foo:
+	movl 8(%esp), %eax
+	movdqa (%eax), %xmm0
+	#IMPLICIT_DEF %eax
+	pinsrw $2, %eax, %xmm0
+	xorl %ecx, %ecx
+	pinsrw $3, %ecx, %xmm0
+	pinsrw $4, %eax, %xmm0
+	pinsrw $5, %ecx, %xmm0
+	pinsrw $6, %eax, %xmm0
+	pinsrw $7, %ecx, %xmm0
+	movl 4(%esp), %eax
+	movdqa (%eax), %xmm1
+	psllw %xmm0, %xmm1
+	movdqa %xmm1, (%eax)
+	ret
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e178646e992..2f9763d6f96 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -355,6 +355,10 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::BUILD_VECTOR,     MVT::v8i8,  Expand);
     setOperationAction(ISD::BUILD_VECTOR,     MVT::v4i16, Expand);
     setOperationAction(ISD::BUILD_VECTOR,     MVT::v2i32, Expand);
+
+    setOperationAction(ISD::VECTOR_SHUFFLE,   MVT::v8i8,  Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,   MVT::v4i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,   MVT::v2i32, Custom);
   }
 
   if (Subtarget->hasSSE1()) {
@@ -2312,7 +2316,7 @@ static SDOperand LowerBuildVectorv16i8(SDOperand Op, unsigned NonZeros,
   return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, V);
 }
 
-/// LowerBuildVectorv16i8 - Custom lower build_vector of v8i16.
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
 ///
 static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,
                                        unsigned NumNonZero, unsigned NumZero,
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 1d3c127532e..f6c74f7cc7c 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -44,6 +44,10 @@ def : Pat<(v2i32 (undef)), (IMPLICIT_DEF_VR64)>;
 
 def loadv2i32 : PatFrag<(ops node:$ptr), (v2i32 (load node:$ptr))>;
 
+def bc_v8i8  : PatFrag<(ops node:$in), (v8i8  (bitconvert node:$in))>;
+def bc_v4i16 : PatFrag<(ops node:$in), (v4i16 (bitconvert node:$in))>;
+def bc_v2i32 : PatFrag<(ops node:$in), (v2i32 (bitconvert node:$in))>;
+
 //===----------------------------------------------------------------------===//
 // MMX Multiclasses
 //===----------------------------------------------------------------------===//
@@ -94,13 +98,28 @@ let isTwoAddress = 1 in {
                   [(set VR64:$dst, (OpNode VR64:$src1,(loadv2i32 addr:$src2)))]>;
   }
+
+  multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                                string OpcodeStr, Intrinsic IntId> {
+    def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>;
+    def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1,
+                                    (bitconvert (loadv2i32 addr:$src2))))]>;
+    def ri : MMXIi8<opc2, ImmForm, (ops VR64:$dst, VR64:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                    [(set VR64:$dst, (IntId VR64:$src1,
+                                      (scalar_to_vector (i32 imm:$src2))))]>;
+  }
 }
 
 //===----------------------------------------------------------------------===//
 // MMX EMMS Instruction
 //===----------------------------------------------------------------------===//
 
-def EMMS : MMXI<0x77, RawFrm, (ops), "emms", [(int_x86_mmx_emms)]>;
+def MMX_EMMS : MMXI<0x77, RawFrm, (ops), "emms", [(int_x86_mmx_emms)]>;
 
 //===----------------------------------------------------------------------===//
 // MMX Scalar Instructions
 //===----------------------------------------------------------------------===//
@@ -132,6 +151,53 @@ defm MMX_PMULLW  : MMXI_binop_rm<0xD5, "pmullw", mul, v4i16, 1>;
 
 defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw" , int_x86_mmx_pmulh_w , 1>;
 defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+
+def MMX_UNPCKH_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKHMask(N);
+}]>;
+
+let isTwoAddress = 1 in {
+def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg,
+                           (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                           "punpckhbw {$src2, $dst|$dst, $src2}",
+                           [(set VR64:$dst,
+                             (v8i8 (vector_shuffle VR64:$src1, VR64:$src2,
+                                    MMX_UNPCKH_shuffle_mask)))]>;
+def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem,
+                           (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                           "punpckhbw {$src2, $dst|$dst, $src2}",
+                           [(set VR64:$dst,
+                             (v8i8 (vector_shuffle VR64:$src1,
+                                    (bc_v8i8 (loadv2i32 addr:$src2)),
+                                    MMX_UNPCKH_shuffle_mask)))]>;
+def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg,
+                           (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                           "punpckhwd {$src2, $dst|$dst, $src2}",
+                           [(set VR64:$dst,
+                             (v4i16 (vector_shuffle VR64:$src1, VR64:$src2,
+                                     MMX_UNPCKH_shuffle_mask)))]>;
+def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem,
+                           (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                           "punpckhwd {$src2, $dst|$dst, $src2}",
+                           [(set VR64:$dst,
+                             (v4i16 (vector_shuffle VR64:$src1,
+                                     (bc_v4i16 (loadv2i32 addr:$src2)),
+                                     MMX_UNPCKH_shuffle_mask)))]>;
+def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg,
+                           (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                           "punpckhdq {$src2, $dst|$dst, $src2}",
+                           [(set VR64:$dst,
+                             (v2i32 (vector_shuffle VR64:$src1, VR64:$src2,
+                                     MMX_UNPCKH_shuffle_mask)))]>;
+def MMX_PUNPCKHDQrm : MMXI<0x6A, MRMSrcMem,
+                           (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                           "punpckhdq {$src2, $dst|$dst, $src2}",
+                           [(set VR64:$dst,
+                             (v2i32 (vector_shuffle VR64:$src1,
+                                     (loadv2i32 addr:$src2),
+                                     MMX_UNPCKH_shuffle_mask)))]>;
+}
+
 // Logical Instructions
 defm MMX_PAND : MMXI_binop_rm_v2i32<0xDB, "pand", and, 1>;
 defm MMX_POR  : MMXI_binop_rm_v2i32<0xEB, "por" , or, 1>;
@@ -150,6 +216,26 @@ let isTwoAddress = 1 in {
                                   (load addr:$src2))))]>;
 }
 
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+                                    int_x86_mmx_psrl_w>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+                                    int_x86_mmx_psrl_d>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+                                    int_x86_mmx_psrl_q>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+                                    int_x86_mmx_psll_w>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+                                    int_x86_mmx_psll_d>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+                                    int_x86_mmx_psll_q>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+                                    int_x86_mmx_psra_w>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+                                    int_x86_mmx_psra_d>;
 
 // Move Instructions
 def MOVD64rr : MMXI<0x6E, MRMSrcReg, (ops VR64:$dst, GR32:$src),
                     "movd {$src, $dst|$dst, $src}", []>;
@@ -225,3 +311,23 @@ def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v8i8 VR64:$src))), (v4i16 VR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v8i8 VR64:$src))), (v2i32 VR64:$src)>;
+
+// Splat v2i32
+let AddedComplexity = 10 in {
+  def : Pat<(vector_shuffle (v2i32 VR64:$src), (undef),
+             MMX_UNPCKH_shuffle_mask:$sm),
+            (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
+}
+
+// FIXME: Temporary workaround because 2-wide shuffle is broken.
+def : Pat<(int_x86_mmx_punpckh_dq VR64:$src1, VR64:$src2),
+          (v2i32 (MMX_PUNPCKHDQrr VR64:$src1, VR64:$src2))>;
+def : Pat<(int_x86_mmx_punpckh_dq VR64:$src1, (load addr:$src2)),
+          (v2i32 (MMX_PUNPCKHDQrm VR64:$src1, addr:$src2))>;
+
+def MMX_X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>;
+
+// Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower 8 or
+// 16 bits matter.
+def : Pat<(v4i16 (MMX_X86s2vec GR32:$src)), (MOVD64rr GR32:$src)>;
+def : Pat<(v8i8  (MMX_X86s2vec GR32:$src)), (MOVD64rr GR32:$src)>;
-- 
2.34.1
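
Note (not part of the patch): the shift and unpack patterns above are easiest to exercise from C through the mmintrin.h wrappers. Below is a minimal sketch under that assumption; the two helper names are invented for illustration, while _mm_sll_pi16, _mm_unpackhi_pi32, and _mm_empty are the standard MMX intrinsics.

#include <mmintrin.h>

/* Shifts each 16-bit lane of x left by the 64-bit count in n.
   Compiles to psllw, i.e. the MMX_PSLLW patterns added above. */
static __m64 shift_words_left(__m64 x, __m64 n)
{
  return _mm_sll_pi16(x, n);
}

/* Interleaves the high 32-bit halves of a and b; compiles to punpckhdq
   (MMX_PUNPCKHDQrr). Passing the same operand twice splats the high
   dword, which is what the "Splat v2i32" pattern above selects. */
static __m64 splat_high_dword(__m64 a)
{
  return _mm_unpackhi_pi32(a, a);
}

As in the README examples, callers must still issue _mm_empty() (emms) before touching x87 floating point afterwards.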