From: Sanjay Patel
Date: Tue, 16 Dec 2014 16:30:01 +0000 (+0000)
Subject: combine consecutive subvector 16-byte loads into one 32-byte load
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=8fe9488a40dd2f569549a0c395b8559e84367ee6;p=oota-llvm.git

combine consecutive subvector 16-byte loads into one 32-byte load

This is a fix for PR21709 ( http://llvm.org/bugs/show_bug.cgi?id=21709 ).
When we have 2 consecutive 16-byte loads that are merged into one 32-byte vector,
we can use a single 32-byte load instead.
But we don't do this for SandyBridge / IvyBridge because they have slower 32-byte memops.
We also don't bother using 32-byte *integer* loads on a machine that only has AVX1 (btver2)
because those operands would have to be split in half anyway since there is no support
for 32-byte integer math ops.

Differential Revision: http://reviews.llvm.org/D6492

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224344 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 8e75f59ee6a..b568740df75 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -773,6 +773,7 @@ def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
 def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
 def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
 def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
+def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
 
 //===----------------------------------------------------------------------===//
 // X86 Instruction Format Definitions.
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 9fa8ce25b87..9aa6fa09db9 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -8158,6 +8158,49 @@ def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                    (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
+// Combine two consecutive 16-byte loads with a common destination register into
+// one 32-byte load to that register.
+let Predicates = [HasAVX, HasFastMem32] in {
+  def : Pat<(insert_subvector
+              (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
+              (loadv4f32 (add addr:$src, (iPTR 16))),
+              (iPTR 4)),
+            (VMOVUPSYrm addr:$src)>;
+
+  def : Pat<(insert_subvector
+              (v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))),
+              (loadv2f64 (add addr:$src, (iPTR 16))),
+              (iPTR 2)),
+            (VMOVUPDYrm addr:$src)>;
+
+  def : Pat<(insert_subvector
+              (v32i8 (insert_subvector
+                undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))),
+              (bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))),
+              (iPTR 16)),
+            (VMOVDQUYrm addr:$src)>;
+
+  def : Pat<(insert_subvector
+              (v16i16 (insert_subvector
+                undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))),
+              (bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))),
+              (iPTR 8)),
+            (VMOVDQUYrm addr:$src)>;
+
+  def : Pat<(insert_subvector
+              (v8i32 (insert_subvector
+                undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))),
+              (bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))),
+              (iPTR 4)),
+            (VMOVDQUYrm addr:$src)>;
+
+  def : Pat<(insert_subvector
+              (v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))),
+              (loadv2i64 (add addr:$src, (iPTR 16))),
+              (iPTR 2)),
+            (VMOVDQUYrm addr:$src)>;
+}
+
 let Predicates = [HasAVX1Only] in {
 def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll
index 01342ba8ccb..347f330d67a 100644
--- a/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
 
 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
 ; because that is slower than two 16-byte loads.
@@ -44,3 +44,236 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
   store <8 x float> %A, <8 x float>* %P, align 16
   ret void
 }
+
+; Merge two consecutive 16-byte subvector loads into a single 32-byte load
+; if it's faster.
+
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
+
+; Use the vinsertf128 intrinsic to model source code
+; that explicitly uses AVX intrinsics.
+define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %v1 = load <4 x float>* %ptr, align 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
+  ret <8 x float> %v3
+}
+
+; Swap the operands of the shufflevector and vinsertf128 to ensure that the
+; pattern still matches.
+define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_swap
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %v1 = load <4 x float>* %ptr, align 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
+  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
+  ret <8 x float> %v3
+}
+
+; Replace the vinsertf128 intrinsic with a shufflevector as might be
+; expected from auto-vectorized code.
+define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %v1 = load <4 x float>* %ptr, align 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %v3
+}
+
+; Swap the order of the shufflevector operands to ensure that the
+; pattern still matches.
+define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %v1 = load <4 x float>* %ptr, align 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x float> %v3
+}
+
+; Check each element type other than float to make sure it is handled correctly.
+; Use the loaded values with an 'add' to make sure we're using the correct load type.
+; Even though BtVer2 has fast 32-byte loads, we should not generate those for
+; 256-bit integer vectors because BtVer2 doesn't have AVX2.
+
+define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
+  ; CHECK-LABEL: combine_16_byte_loads_i64
+
+  ; SANDYB: vextractf128
+  ; SANDYB-NEXT: vpaddq
+  ; SANDYB-NEXT: vpaddq
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vextractf128
+  ; BTVER2-NEXT: vpaddq
+  ; BTVER2-NEXT: vpaddq
+  ; BTVER2-NEXT: vinsertf128
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovdqu
+  ; HASWELL-NEXT: vpaddq
+  ; HASWELL-NEXT: retq
+
+  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 1
+  %v1 = load <2 x i64>* %ptr, align 1
+  %v2 = load <2 x i64>* %ptr2, align 1
+  %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v4 = add <4 x i64> %v3, %x
+  ret <4 x i64> %v4
+}
+
+define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
+  ; CHECK-LABEL: combine_16_byte_loads_i32
+
+  ; SANDYB: vextractf128
+  ; SANDYB-NEXT: vpaddd
+  ; SANDYB-NEXT: vpaddd
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vextractf128
+  ; BTVER2-NEXT: vpaddd
+  ; BTVER2-NEXT: vpaddd
+  ; BTVER2-NEXT: vinsertf128
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovdqu
+  ; HASWELL-NEXT: vpaddd
+  ; HASWELL-NEXT: retq
+
+  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 1
+  %v1 = load <4 x i32>* %ptr, align 1
+  %v2 = load <4 x i32>* %ptr2, align 1
+  %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v4 = add <8 x i32> %v3, %x
+  ret <8 x i32> %v4
+}
+
+define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
+  ; CHECK-LABEL: combine_16_byte_loads_i16
+
+  ; SANDYB: vextractf128
+  ; SANDYB-NEXT: vpaddw
+  ; SANDYB-NEXT: vpaddw
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vextractf128
+  ; BTVER2-NEXT: vpaddw
+  ; BTVER2-NEXT: vpaddw
+  ; BTVER2-NEXT: vinsertf128
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovdqu
+  ; HASWELL-NEXT: vpaddw
+  ; HASWELL-NEXT: retq
+
+  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 1
+  %v1 = load <8 x i16>* %ptr, align 1
+  %v2 = load <8 x i16>* %ptr2, align 1
+  %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v4 = add <16 x i16> %v3, %x
+  ret <16 x i16> %v4
+}
+
+define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
+  ; CHECK-LABEL: combine_16_byte_loads_i8
+
+  ; SANDYB: vextractf128
+  ; SANDYB-NEXT: vpaddb
+  ; SANDYB-NEXT: vpaddb
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vextractf128
+  ; BTVER2-NEXT: vpaddb
+  ; BTVER2-NEXT: vpaddb
+  ; BTVER2-NEXT: vinsertf128
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovdqu
+  ; HASWELL-NEXT: vpaddb
+  ; HASWELL-NEXT: retq
+
+  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 1
+  %v1 = load <16 x i8>* %ptr, align 1
+  %v2 = load <16 x i8>* %ptr2, align 1
+  %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %v4 = add <32 x i8> %v3, %x
+  ret <32 x i8> %v4
+}
+
+define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
+  ; CHECK-LABEL: combine_16_byte_loads_double
+
+  ; SANDYB: vmovupd
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: vaddpd
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovupd
+  ; BTVER2-NEXT: vaddpd
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovupd
+  ; HASWELL: vaddpd
+  ; HASWELL-NEXT: retq
+
+  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 1
+  %v1 = load <2 x double>* %ptr, align 1
+  %v2 = load <2 x double>* %ptr2, align 1
+  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v4 = fadd <4 x double> %v3, %x
+  ret <4 x double> %v4
+}
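
For illustration (not part of the commit), a minimal C sketch of the kind of source
the new patterns target: two unaligned 16-byte loads glued together with vinsertf128,
modeled on the float tests above. The function name is hypothetical; the intrinsics
are the standard AVX ones from immintrin.h.

  #include <immintrin.h>

  /* Load 8 consecutive floats as two unaligned 16-byte halves and combine
     them into one 256-bit vector. With this patch, subtargets that do not
     have slow unaligned 32-byte memops (e.g. -mcpu=core-avx2 or btver2)
     fold the pair into a single 32-byte vmovups, while Sandy Bridge /
     Ivy Bridge keep the two 16-byte loads plus vinsertf128, matching the
     SANDYB checks in the test file. */
  __m256 load_8_floats(const float *p) {
    __m128 lo = _mm_loadu_ps(p);      /* elements 0..3, bytes 0..15  */
    __m128 hi = _mm_loadu_ps(p + 4);  /* elements 4..7, bytes 16..31 */
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
  }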