prevent folding a scalar FP load into a packed logical FP instruction (PR22371)

author Sanjay Patel <spatel@rotateright.com>

Tue, 17 Feb 2015 20:08:21 +0000 (20:08 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Tue, 17 Feb 2015 20:08:21 +0000 (20:08 +0000)
author Sanjay Patel <spatel@rotateright.com>
Tue, 17 Feb 2015 20:08:21 +0000 (20:08 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Tue, 17 Feb 2015 20:08:21 +0000 (20:08 +0000)
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td

index c385b1962ef13fa1934db75b25a0c2ac795bd306..1f4b49f70997e1567887bcca0a17214cdca01042 100644 (file)
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -366,6 +366,15 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
  def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
  def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
  
+// These are needed to match a scalar load that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a vector to a scalar.
+def loadf32_128 : PatFrag<(ops node:$ptr),
+  (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>;
+def loadf64_128 : PatFrag<(ops node:$ptr),
+  (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>;
+
  // Like 'store', but always requires 128-bit vector alignment.
  def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                             (store node:$val, node:$ptr), [{
@@ -457,6 +466,16 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
  def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
  def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
  
+// These are needed to match a scalar memop that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a vector to a scalar.
+def memopfsf32_128 : PatFrag<(ops node:$ptr),
+  (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>;
+def memopfsf64_128 : PatFrag<(ops node:$ptr),
+  (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>;
+
+
  // SSSE3 uses MMX registers for some instructions. They aren't aligned on a
  // 16-byte boundary.
  // FIXME: 8 byte alignment for mmx reads is not required
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp

index 57a078ef95c085966f7a4194a3c4e21d6d7d0b1f..f2f11c0004e8b427cd188ae30ce67e57feee5932 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -933,6 +933,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::DIVSSrr_Int,     X86::DIVSSrm_Int,   0 },
      { X86::DPPDrri,         X86::DPPDrmi,       TB_ALIGN_16 },
      { X86::DPPSrri,         X86::DPPSrmi,       TB_ALIGN_16 },
+
+    // FIXME: We should not be folding Fs* scalar loads into vector
+    // instructions because the vector instructions require vector-sized
+    // loads. Lowering should create vector-sized instructions (the Fv*
+    // variants below) to allow load folding.
      { X86::FsANDNPDrr,      X86::FsANDNPDrm,    TB_ALIGN_16 },
      { X86::FsANDNPSrr,      X86::FsANDNPSrm,    TB_ALIGN_16 },
      { X86::FsANDPDrr,       X86::FsANDPDrm,     TB_ALIGN_16 },
@@ -941,6 +946,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::FsORPSrr,        X86::FsORPSrm,      TB_ALIGN_16 },
      { X86::FsXORPDrr,       X86::FsXORPDrm,     TB_ALIGN_16 },
      { X86::FsXORPSrr,       X86::FsXORPSrm,     TB_ALIGN_16 },
+
+    { X86::FvANDNPDrr,      X86::FvANDNPDrm,    TB_ALIGN_16 },
+    { X86::FvANDNPSrr,      X86::FvANDNPSrm,    TB_ALIGN_16 },
+    { X86::FvANDPDrr,       X86::FvANDPDrm,     TB_ALIGN_16 },
+    { X86::FvANDPSrr,       X86::FvANDPSrm,     TB_ALIGN_16 },
+    { X86::FvORPDrr,        X86::FvORPDrm,      TB_ALIGN_16 },
+    { X86::FvORPSrr,        X86::FvORPSrm,      TB_ALIGN_16 },
+    { X86::FvXORPDrr,       X86::FvXORPDrm,     TB_ALIGN_16 },
+    { X86::FvXORPSrr,       X86::FvXORPSrm,     TB_ALIGN_16 },
      { X86::HADDPDrr,        X86::HADDPDrm,      TB_ALIGN_16 },
      { X86::HADDPSrr,        X86::HADDPSrm,      TB_ALIGN_16 },
      { X86::HSUBPDrr,        X86::HSUBPDrm,      TB_ALIGN_16 },
@@ -1142,14 +1156,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
      { X86::VDIVSSrr_Int,      X86::VDIVSSrm_Int,       0 },
      { X86::VDPPDrri,          X86::VDPPDrmi,           0 },
      { X86::VDPPSrri,          X86::VDPPSrmi,           0 },
-    { X86::VFsANDNPDrr,       X86::VFsANDNPDrm,        0 },
-    { X86::VFsANDNPSrr,       X86::VFsANDNPSrm,        0 },
-    { X86::VFsANDPDrr,        X86::VFsANDPDrm,         0 },
-    { X86::VFsANDPSrr,        X86::VFsANDPSrm,         0 },
-    { X86::VFsORPDrr,         X86::VFsORPDrm,          0 },
-    { X86::VFsORPSrr,         X86::VFsORPSrm,          0 },
-    { X86::VFsXORPDrr,        X86::VFsXORPDrm,         0 },
-    { X86::VFsXORPSrr,        X86::VFsXORPSrm,         0 },
+    // Do not fold VFs* loads because there are no scalar load variants for
+    // these instructions. When folded, the load is required to be 128-bits, so
+    // the load size would not match.
+    { X86::VFvANDNPDrr,       X86::VFvANDNPDrm,        0 },
+    { X86::VFvANDNPSrr,       X86::VFvANDNPSrm,        0 },
+    { X86::VFvANDPDrr,        X86::VFvANDPDrm,         0 },
+    { X86::VFvANDPSrr,        X86::VFvANDPSrm,         0 },
+    { X86::VFvORPDrr,         X86::VFvORPDrm,          0 },
+    { X86::VFvORPSrr,         X86::VFvORPSrm,          0 },
+    { X86::VFvXORPDrr,        X86::VFvXORPDrm,         0 },
+    { X86::VFvXORPSrr,        X86::VFvXORPSrm,         0 },
      { X86::VHADDPDrr,         X86::VHADDPDrm,          0 },
      { X86::VHADDPSrr,         X86::VHADDPSrm,          0 },
      { X86::VHSUBPDrr,         X86::VHSUBPDrm,          0 },
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index 84c67c46021949249beb78b3ff90ad4662da0466..9d2c83001d532f215ceb9ca23019f22cd2cc89e2 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2874,21 +2874,19 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
  multiclass sse12_fp_packed_scalar_logical_alias<
      bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
    defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-              FR32, f32, f128mem, loadf32, SSEPackedSingle, itins, 0>,
-              PS, VEX_4V;
+                FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
+                PS, VEX_4V;
  
    defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-        FR64, f64, f128mem, loadf64, SSEPackedDouble, itins, 0>,
-        PD, VEX_4V;
+                FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
+                PD, VEX_4V;
  
    let Constraints = "$src1 = $dst" in {
      defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
-                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
-                PS;
+                f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
  
      defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
-                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
-                PD;
+                f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
    }
  }
  
diff --git a/test/CodeGen/X86/logical-load-fold.ll b/test/CodeGen/X86/logical-load-fold.ll

new file mode 100644 (file)

index 0000000..6051a5e
--- /dev/null
+++ b/test/CodeGen/X86/logical-load-fold.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=sse2,sse-unaligned-mem | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=avx                    | FileCheck %s --check-prefix=AVX
+
+; Although we have the ability to fold an unaligned load with AVX 
+; and under special conditions with some SSE implementations, we
+; can not fold the load under any circumstances in these test
+; cases because they are not 16-byte loads. The load must be
+; executed as a scalar ('movs*') with a zero extension to
+; 128-bits and then used in the packed logical ('andp*') op. 
+; PR22371 - http://llvm.org/bugs/show_bug.cgi?id=22371
+
+define double @load_double_no_fold(double %x, double %y) {
+; SSE2-LABEL: load_double_no_fold:
+; SSE2:       ## BB#0:
+; SSE2-NEXT:    cmplesd %xmm0, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    andpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: load_double_no_fold:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcmplesd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %cmp = fcmp oge double %x, %y
+  %zext = zext i1 %cmp to i32
+  %conv = sitofp i32 %zext to double
+  ret double %conv
+}
+
+define float @load_float_no_fold(float %x, float %y) {
+; SSE2-LABEL: load_float_no_fold:
+; SSE2:       ## BB#0:
+; SSE2-NEXT:    cmpless %xmm0, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: load_float_no_fold:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+
+  %cmp = fcmp oge float %x, %y
+  %zext = zext i1 %cmp to i32
+  %conv = sitofp i32 %zext to float
+  ret float %conv
+}
+
diff --git a/test/CodeGen/X86/stack-align.ll b/test/CodeGen/X86/stack-align.ll

index eafb7c29fa0aaa3a562d8d307a1948fa19ef1423..74f4c787a4eedcb3ba7e650eb52f0f58ea0c3b62 100644 (file)
--- a/test/CodeGen/X86/stack-align.ll
+++ b/test/CodeGen/X86/stack-align.ll
@@ -1,7 +1,10 @@
  ; RUN: llc < %s -relocation-model=static -mcpu=yonah | FileCheck %s
  
-; The double argument is at 4(esp) which is 16-byte aligned, allowing us to
-; fold the load into the andpd.
+; The double argument is at 4(esp) which is 16-byte aligned, but we
+; are required to read in extra bytes of memory in order to fold the
+; load. Bad Things may happen when reading/processing undefined bytes,
+; so don't fold the load.
+; PR22371 / http://reviews.llvm.org/D7474
  
  target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
  target triple = "i686-apple-darwin8"
@@ -15,22 +18,31 @@ entry:
         %tmp = getelementptr { double, double }* %z, i32 0, i32 0               ; <double*> [#uses=1]
         %tmp1 = load volatile double* %tmp, align 8             ; <double> [#uses=1]
         %tmp2 = tail call double @fabs( double %tmp1 ) readnone ; <double> [#uses=1]
-    ; CHECK: andpd{{.*}}4(%esp), %xmm
         %tmp6 = fadd double %tmp4, %tmp2                ; <double> [#uses=1]
         store volatile double %tmp6, double* %P, align 8
         ret void
+
+; CHECK-LABEL: test:
+; CHECK:       movsd   {{.*}}G, %xmm{{.*}}
+; CHECK:       andpd   %xmm{{.*}}, %xmm{{.*}}
+; CHECK:       movsd   4(%esp), %xmm{{.*}}
+; CHECK:       andpd   %xmm{{.*}}, %xmm{{.*}}
+
+
  }
  
  define void @test2() alignstack(16) nounwind {
  entry:
-    ; CHECK: andl{{.*}}$-16, %esp
+; CHECK-LABEL: test2:
+; CHECK: andl{{.*}}$-16, %esp
      ret void
  }
  
  ; Use a call to force a spill.
  define <2 x double> @test3(<2 x double> %x, <2 x double> %y) alignstack(32) nounwind {
  entry:
-    ; CHECK: andl{{.*}}$-32, %esp
+; CHECK-LABEL: test3:
+; CHECK: andl{{.*}}$-32, %esp
      call void @test2()
      %A = fmul <2 x double> %x, %y
      ret <2 x double> %A
author	Sanjay Patel <spatel@rotateright.com>
	Tue, 17 Feb 2015 20:08:21 +0000 (20:08 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Tue, 17 Feb 2015 20:08:21 +0000 (20:08 +0000)
lib/Target/X86/X86InstrFragmentsSIMD.td		patch \| blob \| history
lib/Target/X86/X86InstrInfo.cpp		patch \| blob \| history
lib/Target/X86/X86InstrSSE.td		patch \| blob \| history
test/CodeGen/X86/logical-load-fold.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/stack-align.ll		patch \| blob \| history