{ X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
{ X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
{ X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
+ { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
+ { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
{ X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
{ X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
{ X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
{ X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
{ X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
{ X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
// AVX 256-bit foldable instructions
{ X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 },
{ X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 },
{ X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 },
+ { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
+ { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
+ { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
+ { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
+ { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
+ { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 },
+ { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 },
+ { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 },
+ { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 },
+ { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 },
+ { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 },
+ { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 },
+ { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 },
+ { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 },
+ { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 },
+ { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 },
+ { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 },
{ X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
{ X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
{ X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
+ { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
{ X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
{ X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 },
{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
{ X86::VPABSBrr128, X86::VPABSBrm128, 0 },
{ X86::VPABSDrr128, X86::VPABSDrm128, 0 },
{ X86::VPABSWrr128, X86::VPABSWrm128, 0 },
+ { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
+ { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
+ { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
+ { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
+ { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
{ X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
{ X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
+ { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 },
+ { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 },
+ { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 },
+ { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 },
+ { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 },
+ { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 },
+ { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 },
+ { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 },
+ { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 },
+ { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 },
+ { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 },
+ { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 },
{ X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
{ X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
+ { X86::VPTESTrr, X86::VPTESTrm, 0 },
{ X86::VRCPPSr, X86::VRCPPSm, 0 },
{ X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 },
{ X86::VROUNDPDr, X86::VROUNDPDm, 0 },
{ X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
{ X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
{ X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
+ { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
{ X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
+ { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
{ X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
{ X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
{ X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
{ X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
{ X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
{ X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
- { X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 },
+ { X86::PINSRBrr, X86::PINSRBrm, 0 },
+ { X86::PINSRDrr, X86::PINSRDrm, 0 },
+ { X86::PINSRQrr, X86::PINSRQrm, 0 },
+ { X86::PINSRWrri, X86::PINSRWrmi, 0 },
{ X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 },
{ X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
{ X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
{ X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
{ X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
{ X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
+ { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
{ X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
{ X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
+ { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
+ { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
{ X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
{ X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
{ X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
{ X86::VPANDrr, X86::VPANDrm, 0 },
{ X86::VPAVGBrr, X86::VPAVGBrm, 0 },
{ X86::VPAVGWrr, X86::VPAVGWrm, 0 },
+ { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
{ X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
+ { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
{ X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
{ X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
{ X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
{ X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
{ X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
{ X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
+ { X86::VPINSRBrr, X86::VPINSRBrm, 0 },
+ { X86::VPINSRDrr, X86::VPINSRDrm, 0 },
+ { X86::VPINSRQrr, X86::VPINSRQrm, 0 },
{ X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
{ X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 },
{ X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
{ X86::VPSRLWrr, X86::VPSRLWrm, 0 },
{ X86::VPSUBBrr, X86::VPSUBBrm, 0 },
{ X86::VPSUBDrr, X86::VPSUBDrm, 0 },
+ { X86::VPSUBQrr, X86::VPSUBQrm, 0 },
{ X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
{ X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
+ { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
+ { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
{ X86::VPSUBWrr, X86::VPSUBWrm, 0 },
{ X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
{ X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "x86_64-apple-macosx10.6.6"
-
-; Test that the order of operands is correct
-; CHECK: select_func
-; CHECK: pblendvb %xmm1, %xmm2
-; CHECK: ret
-
-define void @select_func(<8 x i16> %in) {
+
+; Test that the order of operands is correct
+; CHECK: select_func
+; CHECK: pblendvb {{LCPI0_[0-9]*}}(%rip), %xmm1
+; CHECK: ret
+
+define void @select_func(<8 x i16> %in) {
entry:
%c.lobit.i.i.i = ashr <8 x i16> %in, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%and.i56.i.i.i = and <8 x i16> %c.lobit.i.i.i, <i16 25, i16 8, i16 65, i16 25, i16 8, i16 95, i16 15, i16 45>
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
- ret void
-
-; SSE2: @test1
-; SSE2: psubusw LCPI0_0(%rip), %xmm0
-
-; AVX1: @test1
-; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
-
-; AVX2: @test1
-; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
-}
-
-define void @test2(i16* nocapture %head) nounwind {
+ ret void
+
+; SSE2: @test1
+; SSE2: psubusw %xmm0, %xmm1
+
+; AVX1: @test1
+; AVX1: vpsubusw %xmm0, %xmm1, %xmm1
+
+; AVX2: @test1
+; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
+}
+
+define void @test2(i16* nocapture %head) nounwind {
vector.ph:
br label %vector.body
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
- ret void
-
-; SSE2: @test2
-; SSE2: psubusw LCPI1_0(%rip), %xmm0
-
-; AVX1: @test2
-; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
-
-; AVX2: @test2
-; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
-}
-
-define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
+ ret void
+
+; SSE2: @test2
+; SSE2: psubusw %xmm0, %xmm1
+
+; AVX1: @test2
+; AVX1: vpsubusw %xmm0, %xmm1, %xmm1
+
+; AVX2: @test2
+; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
+}
+
+define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
%0 = insertelement <8 x i16> undef, i16 %w, i32 0
%broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
- ret void
-
-; SSE2: @test4
-; SSE2: psubusb LCPI3_0(%rip), %xmm0
-
-; AVX1: @test4
-; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
-
-; AVX2: @test4
-; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
-}
-
-define void @test5(i8* nocapture %head) nounwind {
+ ret void
+
+; SSE2: @test4
+; SSE2: psubusb %xmm0, %xmm1
+
+; AVX1: @test4
+; AVX1: vpsubusb %xmm0, %xmm1, %xmm1
+
+; AVX2: @test4
+; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
+}
+
+define void @test5(i8* nocapture %head) nounwind {
vector.ph:
br label %vector.body
br i1 %6, label %for.end, label %vector.body
for.end: ; preds = %vector.body
- ret void
-
-; SSE2: @test5
-; SSE2: psubusb LCPI4_0(%rip), %xmm0
-
-; AVX1: @test5
-; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
-
-; AVX2: @test5
-; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
-}
-
-define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
+ ret void
+
+; SSE2: @test5
+; SSE2: psubusb %xmm0, %xmm1
+
+; AVX1: @test5
+; AVX1: vpsubusb %xmm0, %xmm1, %xmm1
+
+; AVX2: @test5
+; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
+}
+
+define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
%0 = insertelement <16 x i8> undef, i8 %w, i32 0
%broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
-; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+aes < %s | FileCheck %s
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+aes,+pclmul < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
}
declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
+define <4 x i32> @stack_fold_movd_load(i32 %a0) {
+ ;CHECK-LABEL: stack_fold_movd_load
+ ;CHECK: movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
+ ret <4 x i32> %2
+}
+
+define i32 @stack_fold_movd_store(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_movd_store
+ ;CHECK: movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+ %1 = extractelement <4 x i32> %a0, i32 0
+ %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ ret i32 %1
+}
+
+define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_movq_load
+ ;CHECK: movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %2
+}
+
+define i64 @stack_fold_movq_store(<2 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_movq_store
+ ;CHECK: movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+ %1 = extractelement <2 x i64> %a0, i32 0
+ %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ ret i64 %1
+}
+
define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pabsb
;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
-; TODO stack_fold_pblendvb
+define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
+ ;CHECK-LABEL: stack_fold_pblendvb
+ ;CHECK: vpblendvb {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
+ ret <16 x i8> %2
+}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
-; TODO stack_fold_pclmulqdq
+define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_pclmulqdq
+ ;CHECK: vpclmulqdq $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+ ret <2 x i64> %2
+}
declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
ret <8 x i16> %3
}
-; TODO stack_fold_pcmpestri
+define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pcmpestri
+ ;CHECK: vpcmpestri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
+ %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
+ ret i32 %2
+}
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-; TODO stack_fold_pcmpestrm
+define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pcmpestrm
+ ;CHECK: vpcmpestrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
+ ret <16 x i8> %2
+}
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
ret <8 x i16> %3
}
-; TODO stack_fold_pcmpistri
+define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pcmpistri
+ ;CHECK: vpcmpistri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+ ret i32 %2
+}
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-; TODO stack_fold_pcmpistrm
+define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pcmpistrm
+ ;CHECK: vpcmpistrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+ ret <16 x i8> %2
+}
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
; TODO stack_fold_pextrb
-; TODO stack_fold_pextrd
-; TODO stack_fold_pextrq
+
+define i32 @stack_fold_pextrd(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_pextrd
+ ;CHECK: pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+ ;CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+ %1 = extractelement <4 x i32> %a0, i32 1
+ %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ ret i32 %1
+}
+
+define i64 @stack_fold_pextrq(<2 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_pextrq
+ ;CHECK: pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+ ;CHECK: movq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
+ %1 = extractelement <2 x i64> %a0, i32 1
+ %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ ret i64 %1
+}
+
; TODO stack_fold_pextrw
define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
}
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
-; TODO stack_fold_phminposuw
+define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_phminposuw
+ ;CHECK: vphminposuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
+ ret <8 x i16> %2
+}
declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
}
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
-; TODO stack_fold_pinsrb
-; TODO stack_fold_pinsrd
-; TODO stack_fold_pinsrq
-; TODO stack_fold_pinsrw
+define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
+ ;CHECK-LABEL: stack_fold_pinsrb
+ ;CHECK: vpinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
+ ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
+ ;CHECK-LABEL: stack_fold_pinsrd
+ ;CHECK: vpinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
+ ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
+ ;CHECK-LABEL: stack_fold_pinsrq
+ ;CHECK: vpinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
+ ret <2 x i64> %2
+}
+
+define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
+ ;CHECK-LABEL: stack_fold_pinsrw
+ ;CHECK: vpinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
+ ret <8 x i16> %2
+}
define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pmaddubsw
}
declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
-; TODO stack_fold_pmovsxbd
+define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxbd
+ ;CHECK: vpmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0)
+ ret <4 x i32> %2
+}
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
-; TODO stack_fold_pmovsxbq
+define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxbq
+ ;CHECK: vpmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0)
+ ret <2 x i64> %2
+}
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
-; TODO stack_fold_pmovsxbw
+define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxbw
+ ;CHECK: vpmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0)
+ ret <8 x i16> %2
+}
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
-; TODO stack_fold_pmovsxdq
+define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxdq
+ ;CHECK: vpmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0)
+ ret <2 x i64> %2
+}
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
-; TODO stack_fold_pmovsxwd
+define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxwd
+ ;CHECK: vpmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0)
+ ret <4 x i32> %2
+}
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
-; TODO stack_fold_pmovsxwq
+define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxwq
+ ;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0)
+ ret <2 x i64> %2
+}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
-; TODO stack_fold_pmovzxbd
+define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxbd
+ ;CHECK: vpmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0)
+ ret <4 x i32> %2
+}
declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
-; TODO stack_fold_pmovzxbq
+define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxbq
+ ;CHECK: vpmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0)
+ ret <2 x i64> %2
+}
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
-; TODO stack_fold_pmovzxbw
+define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxbw
+ ;CHECK: vpmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0)
+ ret <8 x i16> %2
+}
declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
-; TODO stack_fold_pmovzxdq
+define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxdq
+ ;CHECK: vpmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0)
+ ret <2 x i64> %2
+}
declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
-; TODO stack_fold_pmovzxwd
+define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxwd
+ ;CHECK: vpmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0)
+ ret <4 x i32> %2
+}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
-; TODO stack_fold_pmovzxwq
+define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxwq
+ ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0)
+ ret <2 x i64> %2
+}
declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %2
}
-; TODO stack_fold_psubq
+define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_psubq
+ ;CHECK: vpsubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = sub <2 x i64> %a0, %a1
+ ret <2 x i64> %2
+}
define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_psubsb
}
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
-; TODO stack_fold_psubusb
+define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_psubusb
+ ;CHECK: vpsubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %2
+}
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
-; TODO stack_fold_psubusw
+define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_psubusw
+ ;CHECK: vpsubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %2
+}
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %2
}
-; TODO stack_fold_ptest
+define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_ptest
+ ;CHECK: vptest {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %2
+}
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
-; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.2,+aes < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-unknown"
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.2,+aes,+pclmul < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
; Stack reload folding tests.
;
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
ret <2 x i64> %2
-}
-declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
-
-define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pabsb
- ;CHECK: pabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+}
+declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
+
+define <4 x i32> @stack_fold_movd_load(i32 %a0) {
+ ;CHECK-LABEL: stack_fold_movd_load
+ ;CHECK: movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
+ ret <4 x i32> %2
+}
+
+define i32 @stack_fold_movd_store(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_movd_store
+ ;CHECK: movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+ %1 = extractelement <4 x i32> %a0, i32 0
+ %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ ret i32 %1
+}
+
+define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_movq_load
+ ;CHECK: movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %2
+}
+
+define i64 @stack_fold_movq_store(<2 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_movq_store
+ ;CHECK: movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+ %1 = extractelement <2 x i64> %a0, i32 0
+ %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ ret i64 %1
+}
+
+define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pabsb
+ ;CHECK: pabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
ret <16 x i8> %2
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-; TODO stack_fold_pblendvb
-declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-
-define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
+ ;CHECK-LABEL: stack_fold_pblendvb
+ ;CHECK: pblendvb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
+ ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pblendw
;CHECK: pblendw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7)
ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
-
-; TODO stack_fold_pclmulqdq
-declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
-
-define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_pclmulqdq
+ ;CHECK: pclmulqdq $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pcmpeqb
;CHECK: pcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp eq <8 x i16> %a0, %a1
%3 = sext <8 x i1> %2 to <8 x i16>
- ret <8 x i16> %3
-}
-
-; TODO stack_fold_pcmpestri
-declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-; TODO stack_fold_pcmpestrm
-declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
+ ret <8 x i16> %3
+}
+
+define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pcmpestri
+ ;CHECK: pcmpestri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
+ %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
+ ret i32 %2
+}
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pcmpestrm
+ ;CHECK: pcmpestrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
+ ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pcmpgtb
;CHECK: pcmpgtb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <8 x i16> %a0, %a1
%3 = sext <8 x i1> %2 to <8 x i16>
- ret <8 x i16> %3
-}
-
-; TODO stack_fold_pcmpistri
-declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-; TODO stack_fold_pcmpistrm
-declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-; TODO stack_fold_pextrb
-; TODO stack_fold_pextrd
-; TODO stack_fold_pextrq
-; TODO stack_fold_pextrw
-
-define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
+ ret <8 x i16> %3
+}
+
+define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pcmpistri
+ ;CHECK: pcmpistri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+ ret i32 %2
+}
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pcmpistrm
+ ;CHECK: pcmpistrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
+ ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+; TODO stack_fold_pextrb
+
+define i32 @stack_fold_pextrd(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_pextrd
+ ;CHECK: pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+ ;CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+ %1 = extractelement <4 x i32> %a0, i32 1
+ %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ ret i32 %1
+}
+
+define i64 @stack_fold_pextrq(<2 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_pextrq
+ ;CHECK: pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+ ;CHECK: movq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
+ %1 = extractelement <2 x i64> %a0, i32 1
+ %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ ret i64 %1
+}
+
+; TODO stack_fold_pextrw
+
+define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_phaddd
;CHECK: phaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-; TODO stack_fold_phminposuw
-declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
-
-define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_phminposuw
+ ;CHECK: phminposuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
+ ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_phsubd
;CHECK: phsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-; TODO stack_fold_pinsrb
-; TODO stack_fold_pinsrd
-; TODO stack_fold_pinsrq
-; TODO stack_fold_pinsrw
-
-define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaddubsw
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
+ ;CHECK-LABEL: stack_fold_pinsrb
+ ;CHECK: pinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
+ ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
+ ;CHECK-LABEL: stack_fold_pinsrd
+ ;CHECK: pinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
+ ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
+ ;CHECK-LABEL: stack_fold_pinsrq
+ ;CHECK: pinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
+ ret <2 x i64> %2
+}
+
+define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
+ ;CHECK-LABEL: stack_fold_pinsrw
+ ;CHECK: pinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaddubsw
;CHECK: pmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-; TODO stack_fold_pmovsxbd
-declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
-
-; TODO stack_fold_pmovsxbq
-declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
-
-; TODO stack_fold_pmovsxbw
-declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
-
-; TODO stack_fold_pmovsxdq
-declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
-
-; TODO stack_fold_pmovsxwd
-declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
-
-; TODO stack_fold_pmovsxwq
-declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
-
-; TODO stack_fold_pmovzxbd
-declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
-
-; TODO stack_fold_pmovzxbq
-declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
-
-; TODO stack_fold_pmovzxbw
-declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
-
-; TODO stack_fold_pmovzxdq
-declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
-
-; TODO stack_fold_pmovzxwd
-declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
-
-; TODO stack_fold_pmovzxwq
-declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
-
-define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
+}
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxbd
+ ;CHECK: pmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0)
+ ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxbq
+ ;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxbw
+ ;CHECK: pmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0)
+ ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxdq
+ ;CHECK: pmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxwd
+ ;CHECK: pmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0)
+ ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovsxwq
+ ;CHECK: pmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxbd
+ ;CHECK: pmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0)
+ ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxbq
+ ;CHECK: pmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxbw
+ ;CHECK: pmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0)
+ ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxdq
+ ;CHECK: pmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxwd
+ ;CHECK: pmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0)
+ ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_pmovzxwq
+ ;CHECK: pmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_pmuldq
;CHECK: pmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
;CHECK: psubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <4 x i32> %a0, %a1
- ret <4 x i32> %2
-}
-
-; TODO stack_fold_psubq
-
-define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubsb
+ ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_psubq
+ ;CHECK: psubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = sub <2 x i64> %a0, %a1
+ ret <2 x i64> %2
+}
+
+define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_psubsb
;CHECK: psubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
-}
-declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-; TODO stack_fold_psubusb
-declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-; TODO stack_fold_psubusw
-declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
+}
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_psubusb
+ ;CHECK: psubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_psubusw
+ ;CHECK: psubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_psubw
;CHECK: psubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <8 x i16> %a0, %a1
- ret <8 x i16> %2
-}
-
-; TODO stack_fold_ptest
-declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
+ ret <8 x i16> %2
+}
+
+define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_ptest
+ ;CHECK: ptest {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %2
+}
+declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_punpckhbw
;CHECK: punpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()