From e20dfebf8715a249c330b289740d4882d1396ae9 Mon Sep 17 00:00:00 2001 From: Igor Breger Date: Thu, 19 Nov 2015 08:26:56 +0000 Subject: [PATCH] AVX512: Implemented encoding, intrinsics and DAG lowering for VMOVDDUP instructions. Differential Revision: http://reviews.llvm.org/D14702 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@253548 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 18 +++++ .../X86/InstPrinter/X86InstComments.cpp | 30 +++----- lib/Target/X86/X86InstrAVX512.td | 66 +++++++++++------ lib/Target/X86/X86InstrSSE.td | 27 ++++--- lib/Target/X86/X86IntrinsicsInfo.h | 6 ++ test/CodeGen/X86/avx-isa-check.ll | 23 ++++++ test/CodeGen/X86/avx512-intrinsics.ll | 24 +++++++ test/CodeGen/X86/avx512vl-intrinsics.ll | 46 ++++++++++++ test/CodeGen/X86/vector-shuffle-128-v2.ll | 51 +++++++++---- test/CodeGen/X86/vector-shuffle-256-v4.ll | 10 +++ test/MC/X86/avx512-encodings.s | 36 ++++++++++ test/MC/X86/x86-64-avx512f_vl.s | 72 +++++++++++++++++++ 12 files changed, 338 insertions(+), 71 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index df3f9931476..84cd35dd156 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -1618,6 +1618,24 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; + + def int_x86_avx512_mask_movddup_128 : + GCCBuiltin<"__builtin_ia32_movddup128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], + [IntrNoMem]>; + + def int_x86_avx512_mask_movddup_256 : + GCCBuiltin<"__builtin_ia32_movddup256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty], + [IntrNoMem]>; + + def int_x86_avx512_mask_movddup_512 : + GCCBuiltin<"__builtin_ia32_movddup512_mask">, + Intrinsic<[llvm_v8f64_ty], + [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], + [IntrNoMem]>; } // Vector blend diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index e3f59fb3bfd..3bf33dc22a9 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -309,39 +309,25 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_MOVDUP(MOVSLDUP, r) Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - CASE_MOVDUP(MOVSLDUP, m) { - MVT VT = getRegOperandVectorVT(MI, MVT::f32, 0); + CASE_MOVDUP(MOVSLDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(VT, ShuffleMask); + DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); break; - } CASE_MOVDUP(MOVSHDUP, r) Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - CASE_MOVDUP(MOVSHDUP, m) { - MVT VT = getRegOperandVectorVT(MI, MVT::f32, 0); + CASE_MOVDUP(MOVSHDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(VT, ShuffleMask); + DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); break; - } - case X86::VMOVDDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. 
-  case X86::VMOVDDUPYrm:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask);
-    break;
-
-  case X86::MOVDDUPrr:
-  case X86::VMOVDDUPrr:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
+  CASE_MOVDUP(MOVDDUP, r)
+    Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
     // FALL THROUGH.
-  case X86::MOVDDUPrm:
-  case X86::VMOVDDUPrm:
+  CASE_MOVDUP(MOVDDUP, m)
     DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask);
+    DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
     break;
 
   case X86::PSLLDQri:
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 29ea0166177..6176af6af0b 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -4225,26 +4225,6 @@ multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> {
 
 defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>;
 
-//===----------------------------------------------------------------------===//
-// AVX-512 - MOVDDUP
-//===----------------------------------------------------------------------===//
-
-multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
-                          X86MemOperand x86memop, PatFrag memop_frag> {
-def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                   [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
-def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                   [(set RC:$dst,
-                     (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
-}
-
-defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, loadv8f64>,
-                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
-          (VMOVDDUPZrm addr:$src)>;
-
 //===----------------------------------------------------------------------===//
 // Move Low to High and High to Low packed FP Instructions
 //===----------------------------------------------------------------------===//
@@ -7128,6 +7108,52 @@ multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{
 
 defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>;
 defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              X86VectorVTInfo _> {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                   (ins _.RC:$src), OpcodeStr, "$src", "$src",
+                   (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
+  let mayLoad = 1 in
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                 (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+                 (_.VT (OpNode (_.VT (scalar_to_vector
+                                       (_.ScalarLdFrag addr:$src)))))>,
+                 EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+}
+
+multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 AVX512VLVectorVTInfo VTInfo> {
+
+  defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+                                EVEX_V256;
+    defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>,
+                                EVEX_V128;
+  }
+}
+
+multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{
+  defm NAME:    avx512_movddup_common<opc, OpcodeStr, OpNode,
+                                      avx512vl_f64_info>, XD, VEX_W;
+  let isCodeGenOnly = 1 in
+  defm NAME#_I: avx512_movddup_common<opc, OpcodeStr, OpNode,
+                                      avx512vl_i64_info>;
+}
+
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
+
+def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+          (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>;
+def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+          (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - Unpack Instructions
 //===----------------------------------------------------------------------===//
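Note (illustrative, not part of the diff): the AVX512_maskable wrappers above are what give VMOVDDUP its merge- and zero-masked forms. A minimal LLVM IR sketch of how the new 512-bit masked intrinsic reaches these patterns; the declaration and the expected instruction are taken from the avx512-intrinsics.ll test later in this patch, and llc would need an AVX-512 target such as -mcpu=knl:

    declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)

    ; Should select a single merge-masked instruction:
    ;   vmovddup %zmm0, %zmm1 {%k1}   ## zmm1 = zmm0[0,0,2,2,4,4,6,6]
    define <8 x double> @movddup_merge(<8 x double> %src, <8 x double> %passthru, i8 %mask) {
      %r = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %src,
                                                               <8 x double> %passthru,
                                                               i8 %mask)
      ret <8 x double> %r
    }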
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f39fa3f617a..401fffaa22a 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5206,21 +5206,30 @@ def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
 def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
-                      (v4f64 (X86Movddup
-                              (scalar_to_vector (loadf64 addr:$src)))))]>,
+                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
                     Sched<[WriteLoad]>;
 }
 
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
   defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
   defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
 }
 
 defm MOVDDUP : sse3_replicate_dfp<"movddup">;
 
-let Predicates = [HasAVX] in {
+
+let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(X86Movddup (loadv2f64 addr:$src)),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+  // 256-bit version
+  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4i64 VR256:$src)),
+            (VMOVDDUPYrr VR256:$src)>;
+}
+
+let Predicates = [HasAVX] in {
   def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
   def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
@@ -5228,16 +5237,6 @@ let Predicates = [HasAVX] in {
   def : Pat<(X86Movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-
-  // 256-bit version
-  def : Pat<(X86Movddup (loadv4f64 addr:$src)),
-            (VMOVDDUPYrm addr:$src)>;
-  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
-            (VMOVDDUPYrm addr:$src)>;
-  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
-            (VMOVDDUPYrm addr:$src)>;
-  def : Pat<(X86Movddup (v4i64 VR256:$src)),
-            (VMOVDDUPYrr VR256:$src)>;
 }
 
 let Predicates = [UseAVX, OptForSize] in {
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index b4df8dae219..4bdb5b9146e 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -798,6 +798,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::FMIN, X86ISD::FMIN_RND),
   X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FMIN, X86ISD::FMIN_RND),
+  X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::MOVDDUP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::MOVDDUP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::MOVDDUP, 0),
   X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK,
                      X86ISD::MOVSHDUP, 0),
   X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK,
diff --git a/test/CodeGen/X86/avx-isa-check.ll b/test/CodeGen/X86/avx-isa-check.ll
index e8426b67ecb..a6f817f5005 100644
--- a/test/CodeGen/X86/avx-isa-check.ll
+++ b/test/CodeGen/X86/avx-isa-check.ll
@@ -406,3 +406,26 @@ define void @store_h_double(<2 x double> %x, i64* %p) {
   ret void
 }
 
+define <2 x double> @test39(double* %ptr) nounwind {
+  %a = load double, double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+ }
+
+define <2 x double> @test40(<2 x double>* %ptr) nounwind {
+  %v = load <2 x double>, <2 x double>* %ptr
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+ }
+
+define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
+  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
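The INTR_TYPE_1OP_MASK entries above lower each masked movddup intrinsic to a plain X86ISD::MOVDDUP node whose result is then merged with the pass-through operand under the mask. As an illustrative sketch (not from the patch; the function name is hypothetical), the 128-bit intrinsic is semantically equivalent to this mask-free LLVM IR, where only the low two bits of the i8 mask are consumed:

    ; Spells out the semantics of llvm.x86.avx512.mask.movddup.128(%x0, %x1, %m).
    define <2 x double> @movddup128_semantics(<2 x double> %x0, <2 x double> %x1, i8 %m) {
      ; Duplicate element 0 of the source ...
      %dup = shufflevector <2 x double> %x0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
      ; ... then take each lane from the duplicated result or from the
      ; pass-through value, depending on the corresponding mask bit.
      %bits = bitcast i8 %m to <8 x i1>
      %mask = shufflevector <8 x i1> %bits, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
      %r = select <2 x i1> %mask, <2 x double> %dup, <2 x double> %x1
      ret <2 x double> %r
    }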
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index ef5199bbf8d..000495fcaa6 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4750,3 +4750,27 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16
   ret <16 x float> %res4
 }
 
+declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovddup %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    ## zmm1 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    vmovddup %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    ## zmm2 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    vmovddup %zmm0, %zmm0
+; CHECK-NEXT:    ## zmm0 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
+  %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
+  %res3 = fadd <8 x double> %res, %res1
+  %res4 = fadd <8 x double> %res2, %res3
+  ret <8 x double> %res4
+}
+
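As the test above shows, zero-masking is expressed at the IR level by passing zeroinitializer as the pass-through operand, which the lowering turns into the {z} form of the instruction. A minimal sketch (illustrative, not part of the diff; the function name is hypothetical):

    declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)

    ; Expected to select something like: vmovddup %zmm0, %zmm0 {%k1} {z}
    define <8 x double> @movddup_zero(<8 x double> %src, i8 %mask) {
      %r = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %src,
                                                               <8 x double> zeroinitializer,
                                                               i8 %mask)
      ret <8 x double> %r
    }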
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 839ae2c5eb2..c3e53e89248 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -5483,4 +5483,50 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f
   %res4 = fadd <8 x float> %res2, %res3
   ret <8 x float> %res4
 }
+declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovddup %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    ## xmm1 = xmm0[0,0]
+; CHECK-NEXT:    vmovddup %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    ## xmm2 = xmm0[0,0]
+; CHECK-NEXT:    vmovddup %xmm0, %xmm0
+; CHECK-NEXT:    ## xmm0 = xmm0[0,0]
+; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2)
+  %res3 = fadd <2 x double> %res, %res1
+  %res4 = fadd <2 x double> %res2, %res3
+  ret <2 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vmovddup %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    ## ymm1 = ymm0[0,0,2,2]
+; CHECK-NEXT:    vmovddup %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    ## ymm2 = ymm0[0,0,2,2]
+; CHECK-NEXT:    vmovddup %ymm0, %ymm0
+; CHECK-NEXT:    ## ymm0 = ymm0[0,0,2,2]
+; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1)
+  %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2)
+  %res3 = fadd <4 x double> %res, %res1
+  %res4 = fadd <4 x double> %res2, %res3
+  ret <4 x double> %res4
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index c81ea51f21e..d1eac6e695c 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1361,27 +1361,48 @@ define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
 ; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: insert_dup_mem_v2f64:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: insert_dup_mem_v2f64:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: insert_dup_mem_v2f64:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vmovsd (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: insert_dup_mem_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX-NEXT:    retq
   %a = load double, double* %ptr
   %v = insertelement <2 x double> undef, double %a, i32 0
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
   ret <2 x double> %shuffle
 }
 
+define <2 x double> @insert_dup_mem128_v2f64(<2 x double>* %ptr) nounwind {
+; SSE2-LABEL: insert_dup_mem128_v2f64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps (%rdi), %xmm0
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_dup_mem128_v2f64:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_dup_mem128_v2f64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_dup_mem128_v2f64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: insert_dup_mem128_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX-NEXT:    retq
+  %v = load <2 x double>, <2 x double>* %ptr
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+
+
 define <2 x i64> @insert_dup_mem_v2i64(i64* %ptr) {
 ; SSE-LABEL: insert_dup_mem_v2i64:
 ; SSE:       # BB#0:
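The consolidated AVX check lines above reflect the new VL patterns: with AVX-512VL enabled, a scalar load followed by a splat no longer needs a separate vmovsd and now folds into a single vmovddup from memory, via the (v2f64 (X86VBroadcast (loadf64 addr:$src))) pattern added earlier in this patch. The same memory folding works at 256 bits through the rewritten VMOVDDUPYrm pattern (a full-vector load instead of scalar_to_vector), which the shuffle_v4f64mem_0022 test in the next file exercises; an illustrative IR sketch (function name hypothetical):

    ; Expected: vmovddup (%rdi), %ymm0  # ymm0 = mem[0,0,2,2]
    define <4 x double> @dup_even_lanes_from_mem(<4 x double>* %p) {
      %v = load <4 x double>, <4 x double>* %p
      %s = shufflevector <4 x double> %v, <4 x double> undef,
                         <4 x i32> <i32 0, i32 0, i32 2, i32 2>
      ret <4 x double> %s
    }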
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 366726f75cb..fb552c41323 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -188,6 +188,16 @@ define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
   ret <4 x double> %shuffle
 }
 
+define <4 x double> @shuffle_v4f64mem_0022(<4 x double>* %ptr, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64mem_0022:
+; ALL:       # BB#0:
+; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
+; ALL-NEXT:    retq
+  %a = load <4 x double>, <4 x double>* %ptr
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
 define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_1032:
 ; ALL:       # BB#0:
diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index 01224ca84ce..818b647b917 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -18561,6 +18561,42 @@ vpermilpd $0x23, 0x400(%rbx), %zmm2
 // CHECK: encoding: [0x62,0x61,0xfd,0x08,0x17,0x8a,0xf8,0xfb,0xff,0xff]
           vmovhpd %xmm25, -1032(%rdx)
 
+// CHECK: vmovddup %zmm29, %zmm5
+// CHECK: encoding: [0x62,0x91,0xff,0x48,0x12,0xed]
+          vmovddup %zmm29, %zmm5
+
+// CHECK: vmovddup %zmm29, %zmm5 {%k4}
+// CHECK: encoding: [0x62,0x91,0xff,0x4c,0x12,0xed]
+          vmovddup %zmm29, %zmm5 {%k4}
+
+// CHECK: vmovddup %zmm29, %zmm5 {%k4} {z}
+// CHECK: encoding: [0x62,0x91,0xff,0xcc,0x12,0xed]
+          vmovddup %zmm29, %zmm5 {%k4} {z}
+
+// CHECK: vmovddup (%rcx), %zmm5
+// CHECK: encoding: [0x62,0xf1,0xff,0x48,0x12,0x29]
+          vmovddup (%rcx), %zmm5
+
+// CHECK: vmovddup 291(%rax,%r14,8), %zmm5
+// CHECK: encoding: [0x62,0xb1,0xff,0x48,0x12,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovddup 291(%rax,%r14,8), %zmm5
+
+// CHECK: vmovddup 8128(%rdx), %zmm5
+// CHECK: encoding: [0x62,0xf1,0xff,0x48,0x12,0x6a,0x7f]
+          vmovddup 8128(%rdx), %zmm5
+
+// CHECK: vmovddup 8192(%rdx), %zmm5
+// CHECK: encoding: [0x62,0xf1,0xff,0x48,0x12,0xaa,0x00,0x20,0x00,0x00]
+          vmovddup 8192(%rdx), %zmm5
+
+// CHECK: vmovddup -8192(%rdx), %zmm5
+// CHECK: encoding: [0x62,0xf1,0xff,0x48,0x12,0x6a,0x80]
+          vmovddup -8192(%rdx), %zmm5
+
+// CHECK: vmovddup -8256(%rdx), %zmm5
+// CHECK: encoding: [0x62,0xf1,0xff,0x48,0x12,0xaa,0xc0,0xdf,0xff,0xff]
+          vmovddup -8256(%rdx), %zmm5
+
 // CHECK: vmovsd.s %xmm15, %xmm22, %xmm21
 // CHECK: encoding: [0x62,0x31,0xcf,0x00,0x11,0xfd]
           vmovsd.s %xmm15, %xmm22, %xmm21
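The displacement values in these encoding tests probe EVEX disp8*N compression at its boundaries. For the 512-bit form the memory operand is a full 64-byte vector, so N = 64; a worked check against the encodings shown above:

    8128  = 127 * 64   -> fits a compressed disp8 (encoded as 0x7f)
    8192  = 128 * 64   -> out of disp8 range, falls back to a 4-byte disp32
    -8192 = -128 * 64  -> still fits disp8 (encoded as 0x80)
    -8256 = -129 * 64  -> out of range, disp32 again

The VL tests that follow exercise the same boundaries with N = 8 for the xmm form (per EVEX_CD8<64, CD8VH> the memory operand is a single f64) and N = 32 for the ymm form (a full 32-byte vector): 1016 = 127 * 8 and 4064 = 127 * 32 are the largest disp8-encodable offsets there.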
diff --git a/test/MC/X86/x86-64-avx512f_vl.s b/test/MC/X86/x86-64-avx512f_vl.s
index 1cdb0bf0a10..8031c097a1e 100644
--- a/test/MC/X86/x86-64-avx512f_vl.s
+++ b/test/MC/X86/x86-64-avx512f_vl.s
@@ -22123,6 +22123,78 @@ vaddpd {rz-sae}, %zmm2, %zmm1, %zmm1
 // CHECK: encoding: [0x62,0x61,0x7e,0x28,0x12,0x82,0xe0,0xef,0xff,0xff]
           vmovsldup -4128(%rdx), %ymm24
 
+// CHECK: vmovddup %xmm23, %xmm17
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x12,0xcf]
+          vmovddup %xmm23, %xmm17
+
+// CHECK: vmovddup %xmm23, %xmm17 {%k6}
+// CHECK: encoding: [0x62,0xa1,0xff,0x0e,0x12,0xcf]
+          vmovddup %xmm23, %xmm17 {%k6}
+
+// CHECK: vmovddup %xmm23, %xmm17 {%k6} {z}
+// CHECK: encoding: [0x62,0xa1,0xff,0x8e,0x12,0xcf]
+          vmovddup %xmm23, %xmm17 {%k6} {z}
+
+// CHECK: vmovddup (%rcx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x12,0x09]
+          vmovddup (%rcx), %xmm17
+
+// CHECK: vmovddup 291(%rax,%r14,8), %xmm17
+// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x12,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovddup 291(%rax,%r14,8), %xmm17
+
+// CHECK: vmovddup 1016(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x12,0x4a,0x7f]
+          vmovddup 1016(%rdx), %xmm17
+
+// CHECK: vmovddup 1024(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x12,0x8a,0x00,0x04,0x00,0x00]
+          vmovddup 1024(%rdx), %xmm17
+
+// CHECK: vmovddup -1024(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x12,0x4a,0x80]
+          vmovddup -1024(%rdx), %xmm17
+
+// CHECK: vmovddup -1032(%rdx), %xmm17
+// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x12,0x8a,0xf8,0xfb,0xff,0xff]
+          vmovddup -1032(%rdx), %xmm17
+
+// CHECK: vmovddup %ymm25, %ymm18
+// CHECK: encoding: [0x62,0x81,0xff,0x28,0x12,0xd1]
+          vmovddup %ymm25, %ymm18
+
+// CHECK: vmovddup %ymm25, %ymm18 {%k4}
+// CHECK: encoding: [0x62,0x81,0xff,0x2c,0x12,0xd1]
+          vmovddup %ymm25, %ymm18 {%k4}
+
+// CHECK: vmovddup %ymm25, %ymm18 {%k4} {z}
+// CHECK: encoding: [0x62,0x81,0xff,0xac,0x12,0xd1]
+          vmovddup %ymm25, %ymm18 {%k4} {z}
+
+// CHECK: vmovddup (%rcx), %ymm18
+// CHECK: encoding: [0x62,0xe1,0xff,0x28,0x12,0x11]
+          vmovddup (%rcx), %ymm18
+
+// CHECK: vmovddup 291(%rax,%r14,8), %ymm18
+// CHECK: encoding: [0x62,0xa1,0xff,0x28,0x12,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vmovddup 291(%rax,%r14,8), %ymm18
+
+// CHECK: vmovddup 4064(%rdx), %ymm18
+// CHECK: encoding: [0x62,0xe1,0xff,0x28,0x12,0x52,0x7f]
+          vmovddup 4064(%rdx), %ymm18
+
+// CHECK: vmovddup 4096(%rdx), %ymm18
+// CHECK: encoding: [0x62,0xe1,0xff,0x28,0x12,0x92,0x00,0x10,0x00,0x00]
+          vmovddup 4096(%rdx), %ymm18
+
+// CHECK: vmovddup -4096(%rdx), %ymm18
+// CHECK: encoding: [0x62,0xe1,0xff,0x28,0x12,0x52,0x80]
+          vmovddup -4096(%rdx), %ymm18
+
+// CHECK: vmovddup -4128(%rdx), %ymm18
+// CHECK: encoding: [0x62,0xe1,0xff,0x28,0x12,0x92,0xe0,0xef,0xff,0xff]
+          vmovddup -4128(%rdx), %ymm18
+
 // CHECK: vmovapd.s %xmm27, %xmm26
 // CHECK: encoding: [0x62,0x01,0xfd,0x08,0x29,0xda]
           vmovapd.s %xmm27, %xmm26
-- 
2.34.1